One of the questions I try to answer in this notebook is whether there is gender inequality in the tech space. Assuming there is some (which is likely, given that a large share of the survey respondents are male), how does it affect earning potential?
In this notebook I:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
# Widen the pandas display limits so wide/long survey tables are shown in full.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, 500)
# Load the survey responses. low_memory=False makes pandas infer each column's
# dtype from the whole file at once; the default chunked inference is what
# raises the "Columns (0) have mixed types" DtypeWarning on this file.
df = pd.read_csv('kaggle_survey_2020_responses.csv', low_memory=False)
display(df.shape)
# Row 0 holds the question text rather than a respondent's answers, so drop it.
# Take an explicit copy so df_fin is an independent frame and later edits to it
# neither touch df nor trigger SettingWithCopyWarning.
df_fin = df.iloc[1:, :].copy()
C:\Users\ugonn\anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3165: DtypeWarning: Columns (0) have mixed types.Specify dtype option on import or set low_memory=False. has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
(20037, 355)
# Preview the first five rows; row 0 carries the full question text for each column.
df.head(n=5)
| Time from Start to Finish (seconds) | Q1 | Q2 | Q3 | Q4 | Q5 | Q6 | Q7_Part_1 | Q7_Part_2 | Q7_Part_3 | Q7_Part_4 | Q7_Part_5 | Q7_Part_6 | Q7_Part_7 | Q7_Part_8 | Q7_Part_9 | Q7_Part_10 | Q7_Part_11 | Q7_Part_12 | Q7_OTHER | Q8 | Q9_Part_1 | Q9_Part_2 | Q9_Part_3 | Q9_Part_4 | Q9_Part_5 | Q9_Part_6 | Q9_Part_7 | Q9_Part_8 | Q9_Part_9 | Q9_Part_10 | Q9_Part_11 | Q9_OTHER | Q10_Part_1 | Q10_Part_2 | Q10_Part_3 | Q10_Part_4 | Q10_Part_5 | Q10_Part_6 | Q10_Part_7 | Q10_Part_8 | Q10_Part_9 | Q10_Part_10 | Q10_Part_11 | Q10_Part_12 | Q10_Part_13 | Q10_OTHER | Q11 | Q12_Part_1 | Q12_Part_2 | Q12_Part_3 | Q12_OTHER | Q13 | Q14_Part_1 | Q14_Part_2 | Q14_Part_3 | Q14_Part_4 | Q14_Part_5 | Q14_Part_6 | Q14_Part_7 | Q14_Part_8 | Q14_Part_9 | Q14_Part_10 | Q14_Part_11 | Q14_OTHER | Q15 | Q16_Part_1 | Q16_Part_2 | Q16_Part_3 | Q16_Part_4 | Q16_Part_5 | Q16_Part_6 | Q16_Part_7 | Q16_Part_8 | Q16_Part_9 | Q16_Part_10 | Q16_Part_11 | Q16_Part_12 | Q16_Part_13 | Q16_Part_14 | Q16_Part_15 | Q16_OTHER | Q17_Part_1 | Q17_Part_2 | Q17_Part_3 | Q17_Part_4 | Q17_Part_5 | Q17_Part_6 | Q17_Part_7 | Q17_Part_8 | Q17_Part_9 | Q17_Part_10 | Q17_Part_11 | Q17_OTHER | Q18_Part_1 | Q18_Part_2 | Q18_Part_3 | Q18_Part_4 | Q18_Part_5 | Q18_Part_6 | Q18_OTHER | Q19_Part_1 | Q19_Part_2 | Q19_Part_3 | Q19_Part_4 | Q19_Part_5 | Q19_OTHER | Q20 | Q21 | Q22 | Q23_Part_1 | Q23_Part_2 | Q23_Part_3 | Q23_Part_4 | Q23_Part_5 | Q23_Part_6 | Q23_Part_7 | Q23_OTHER | Q24 | Q25 | Q26_A_Part_1 | Q26_A_Part_2 | Q26_A_Part_3 | Q26_A_Part_4 | Q26_A_Part_5 | Q26_A_Part_6 | Q26_A_Part_7 | Q26_A_Part_8 | Q26_A_Part_9 | Q26_A_Part_10 | Q26_A_Part_11 | Q26_A_OTHER | Q27_A_Part_1 | Q27_A_Part_2 | Q27_A_Part_3 | Q27_A_Part_4 | Q27_A_Part_5 | Q27_A_Part_6 | Q27_A_Part_7 | Q27_A_Part_8 | Q27_A_Part_9 | Q27_A_Part_10 | Q27_A_Part_11 | Q27_A_OTHER | Q28_A_Part_1 | Q28_A_Part_2 | Q28_A_Part_3 | Q28_A_Part_4 | Q28_A_Part_5 | Q28_A_Part_6 | Q28_A_Part_7 | Q28_A_Part_8 | Q28_A_Part_9 | Q28_A_Part_10 | Q28_A_OTHER | Q29_A_Part_1 
| Q29_A_Part_2 | Q29_A_Part_3 | Q29_A_Part_4 | Q29_A_Part_5 | Q29_A_Part_6 | Q29_A_Part_7 | Q29_A_Part_8 | Q29_A_Part_9 | Q29_A_Part_10 | Q29_A_Part_11 | Q29_A_Part_12 | Q29_A_Part_13 | Q29_A_Part_14 | Q29_A_Part_15 | Q29_A_Part_16 | Q29_A_Part_17 | Q29_A_OTHER | Q30 | Q31_A_Part_1 | Q31_A_Part_2 | Q31_A_Part_3 | Q31_A_Part_4 | Q31_A_Part_5 | Q31_A_Part_6 | Q31_A_Part_7 | Q31_A_Part_8 | Q31_A_Part_9 | Q31_A_Part_10 | Q31_A_Part_11 | Q31_A_Part_12 | Q31_A_Part_13 | Q31_A_Part_14 | Q31_A_OTHER | Q32 | Q33_A_Part_1 | Q33_A_Part_2 | Q33_A_Part_3 | Q33_A_Part_4 | Q33_A_Part_5 | Q33_A_Part_6 | Q33_A_Part_7 | Q33_A_OTHER | Q34_A_Part_1 | Q34_A_Part_2 | Q34_A_Part_3 | Q34_A_Part_4 | Q34_A_Part_5 | Q34_A_Part_6 | Q34_A_Part_7 | Q34_A_Part_8 | Q34_A_Part_9 | Q34_A_Part_10 | Q34_A_Part_11 | Q34_A_OTHER | Q35_A_Part_1 | Q35_A_Part_2 | Q35_A_Part_3 | Q35_A_Part_4 | Q35_A_Part_5 | Q35_A_Part_6 | Q35_A_Part_7 | Q35_A_Part_8 | Q35_A_Part_9 | Q35_A_Part_10 | Q35_A_OTHER | Q36_Part_1 | Q36_Part_2 | Q36_Part_3 | Q36_Part_4 | Q36_Part_5 | Q36_Part_6 | Q36_Part_7 | Q36_Part_8 | Q36_Part_9 | Q36_OTHER | Q37_Part_1 | Q37_Part_2 | Q37_Part_3 | Q37_Part_4 | Q37_Part_5 | Q37_Part_6 | Q37_Part_7 | Q37_Part_8 | Q37_Part_9 | Q37_Part_10 | Q37_Part_11 | Q37_OTHER | Q38 | Q39_Part_1 | Q39_Part_2 | Q39_Part_3 | Q39_Part_4 | Q39_Part_5 | Q39_Part_6 | Q39_Part_7 | Q39_Part_8 | Q39_Part_9 | Q39_Part_10 | Q39_Part_11 | Q39_OTHER | Q26_B_Part_1 | Q26_B_Part_2 | Q26_B_Part_3 | Q26_B_Part_4 | Q26_B_Part_5 | Q26_B_Part_6 | Q26_B_Part_7 | Q26_B_Part_8 | Q26_B_Part_9 | Q26_B_Part_10 | Q26_B_Part_11 | Q26_B_OTHER | Q27_B_Part_1 | Q27_B_Part_2 | Q27_B_Part_3 | Q27_B_Part_4 | Q27_B_Part_5 | Q27_B_Part_6 | Q27_B_Part_7 | Q27_B_Part_8 | Q27_B_Part_9 | Q27_B_Part_10 | Q27_B_Part_11 | Q27_B_OTHER | Q28_B_Part_1 | Q28_B_Part_2 | Q28_B_Part_3 | Q28_B_Part_4 | Q28_B_Part_5 | Q28_B_Part_6 | Q28_B_Part_7 | Q28_B_Part_8 | Q28_B_Part_9 | Q28_B_Part_10 | Q28_B_OTHER | Q29_B_Part_1 | Q29_B_Part_2 | Q29_B_Part_3 | 
Q29_B_Part_4 | Q29_B_Part_5 | Q29_B_Part_6 | Q29_B_Part_7 | Q29_B_Part_8 | Q29_B_Part_9 | Q29_B_Part_10 | Q29_B_Part_11 | Q29_B_Part_12 | Q29_B_Part_13 | Q29_B_Part_14 | Q29_B_Part_15 | Q29_B_Part_16 | Q29_B_Part_17 | Q29_B_OTHER | Q31_B_Part_1 | Q31_B_Part_2 | Q31_B_Part_3 | Q31_B_Part_4 | Q31_B_Part_5 | Q31_B_Part_6 | Q31_B_Part_7 | Q31_B_Part_8 | Q31_B_Part_9 | Q31_B_Part_10 | Q31_B_Part_11 | Q31_B_Part_12 | Q31_B_Part_13 | Q31_B_Part_14 | Q31_B_OTHER | Q33_B_Part_1 | Q33_B_Part_2 | Q33_B_Part_3 | Q33_B_Part_4 | Q33_B_Part_5 | Q33_B_Part_6 | Q33_B_Part_7 | Q33_B_OTHER | Q34_B_Part_1 | Q34_B_Part_2 | Q34_B_Part_3 | Q34_B_Part_4 | Q34_B_Part_5 | Q34_B_Part_6 | Q34_B_Part_7 | Q34_B_Part_8 | Q34_B_Part_9 | Q34_B_Part_10 | Q34_B_Part_11 | Q34_B_OTHER | Q35_B_Part_1 | Q35_B_Part_2 | Q35_B_Part_3 | Q35_B_Part_4 | Q35_B_Part_5 | Q35_B_Part_6 | Q35_B_Part_7 | Q35_B_Part_8 | Q35_B_Part_9 | Q35_B_Part_10 | Q35_B_OTHER | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Duration (in seconds) | What is your age (# years)? | What is your gender? - Selected Choice | In which country do you currently reside? | What is the highest level of formal education ... | Select the title most similar to your current ... | For how many years have you been writing code ... | What programming languages do you use on a reg... | What programming languages do you use on a reg... | What programming languages do you use on a reg... | What programming languages do you use on a reg... | What programming languages do you use on a reg... | What programming languages do you use on a reg... | What programming languages do you use on a reg... | What programming languages do you use on a reg... | What programming languages do you use on a reg... | What programming languages do you use on a reg... | What programming languages do you use on a reg... | What programming languages do you use on a reg... | What programming languages do you use on a reg... | What programming language would you recommend ... | Which of the following integrated development ... | Which of the following integrated development ... | Which of the following integrated development ... | Which of the following integrated development ... | Which of the following integrated development ... | Which of the following integrated development ... | Which of the following integrated development ... | Which of the following integrated development ... | Which of the following integrated development ... | Which of the following integrated development ... | Which of the following integrated development ... | Which of the following integrated development ... | Which of the following hosted notebook product... | Which of the following hosted notebook product... | Which of the following hosted notebook product... | Which of the following hosted notebook product... | Which of the following hosted notebook product... | Which of the following hosted notebook product... 
| Which of the following hosted notebook product... | Which of the following hosted notebook product... | Which of the following hosted notebook product... | Which of the following hosted notebook product... | Which of the following hosted notebook product... | Which of the following hosted notebook product... | Which of the following hosted notebook product... | Which of the following hosted notebook product... | What type of computing platform do you use mos... | Which types of specialized hardware do you use... | Which types of specialized hardware do you use... | Which types of specialized hardware do you use... | Which types of specialized hardware do you use... | Approximately how many times have you used a T... | What data visualization libraries or tools do ... | What data visualization libraries or tools do ... | What data visualization libraries or tools do ... | What data visualization libraries or tools do ... | What data visualization libraries or tools do ... | What data visualization libraries or tools do ... | What data visualization libraries or tools do ... | What data visualization libraries or tools do ... | What data visualization libraries or tools do ... | What data visualization libraries or tools do ... | What data visualization libraries or tools do ... | What data visualization libraries or tools do ... | For how many years have you used machine learn... | Which of the following machine learning framew... | Which of the following machine learning framew... | Which of the following machine learning framew... | Which of the following machine learning framew... | Which of the following machine learning framew... | Which of the following machine learning framew... | Which of the following machine learning framew... | Which of the following machine learning framew... | Which of the following machine learning framew... | Which of the following machine learning framew... | Which of the following machine learning framew... 
| Which of the following machine learning framew... | Which of the following machine learning framew... | Which of the following machine learning framew... | Which of the following machine learning framew... | Which of the following machine learning framew... | Which of the following ML algorithms do you us... | Which of the following ML algorithms do you us... | Which of the following ML algorithms do you us... | Which of the following ML algorithms do you us... | Which of the following ML algorithms do you us... | Which of the following ML algorithms do you us... | Which of the following ML algorithms do you us... | Which of the following ML algorithms do you us... | Which of the following ML algorithms do you us... | Which of the following ML algorithms do you us... | Which of the following ML algorithms do you us... | Which of the following ML algorithms do you us... | Which categories of computer vision methods do... | Which categories of computer vision methods do... | Which categories of computer vision methods do... | Which categories of computer vision methods do... | Which categories of computer vision methods do... | Which categories of computer vision methods do... | Which categories of computer vision methods do... | Which of the following natural language proces... | Which of the following natural language proces... | Which of the following natural language proces... | Which of the following natural language proces... | Which of the following natural language proces... | Which of the following natural language proces... | What is the size of the company where you are ... | Approximately how many individuals are respons... | Does your current employer incorporate machine... | Select any activities that make up an importan... | Select any activities that make up an importan... | Select any activities that make up an importan... | Select any activities that make up an importan... | Select any activities that make up an importan... 
| Select any activities that make up an importan... | Select any activities that make up an importan... | Select any activities that make up an importan... | What is your current yearly compensation (appr... | Approximately how much money have you (or your... | Which of the following cloud computing platfor... | Which of the following cloud computing platfor... | Which of the following cloud computing platfor... | Which of the following cloud computing platfor... | Which of the following cloud computing platfor... | Which of the following cloud computing platfor... | Which of the following cloud computing platfor... | Which of the following cloud computing platfor... | Which of the following cloud computing platfor... | Which of the following cloud computing platfor... | Which of the following cloud computing platfor... | Which of the following cloud computing platfor... | Do you use any of the following cloud computin... | Do you use any of the following cloud computin... | Do you use any of the following cloud computin... | Do you use any of the following cloud computin... | Do you use any of the following cloud computin... | Do you use any of the following cloud computin... | Do you use any of the following cloud computin... | Do you use any of the following cloud computin... | Do you use any of the following cloud computin... | Do you use any of the following cloud computin... | Do you use any of the following cloud computin... | Do you use any of the following cloud computin... | Do you use any of the following machine learni... | Do you use any of the following machine learni... | Do you use any of the following machine learni... | Do you use any of the following machine learni... | Do you use any of the following machine learni... | Do you use any of the following machine learni... | Do you use any of the following machine learni... | Do you use any of the following machine learni... | Do you use any of the following machine learni... 
| Do you use any of the following machine learni... | Do you use any of the following machine learni... | Which of the following big data products (rela... | Which of the following big data products (rela... | Which of the following big data products (rela... | Which of the following big data products (rela... | Which of the following big data products (rela... | Which of the following big data products (rela... | Which of the following big data products (rela... | Which of the following big data products (rela... | Which of the following big data products (rela... | Which of the following big data products (rela... | Which of the following big data products (rela... | Which of the following big data products (rela... | Which of the following big data products (rela... | Which of the following big data products (rela... | Which of the following big data products (rela... | Which of the following big data products (rela... | Which of the following big data products (rela... | Which of the following big data products (rela... | Which of the following big data products (rela... | Which of the following business intelligence t... | Which of the following business intelligence t... | Which of the following business intelligence t... | Which of the following business intelligence t... | Which of the following business intelligence t... | Which of the following business intelligence t... | Which of the following business intelligence t... | Which of the following business intelligence t... | Which of the following business intelligence t... | Which of the following business intelligence t... | Which of the following business intelligence t... | Which of the following business intelligence t... | Which of the following business intelligence t... | Which of the following business intelligence t... | Which of the following business intelligence t... | Which of the following business intelligence t... | Do you use any automated machine learning tool... 
| Do you use any automated machine learning tool... | Do you use any automated machine learning tool... | Do you use any automated machine learning tool... | Do you use any automated machine learning tool... | Do you use any automated machine learning tool... | Do you use any automated machine learning tool... | Do you use any automated machine learning tool... | Which of the following automated machine learn... | Which of the following automated machine learn... | Which of the following automated machine learn... | Which of the following automated machine learn... | Which of the following automated machine learn... | Which of the following automated machine learn... | Which of the following automated machine learn... | Which of the following automated machine learn... | Which of the following automated machine learn... | Which of the following automated machine learn... | Which of the following automated machine learn... | Which of the following automated machine learn... | Do you use any tools to help manage machine le... | Do you use any tools to help manage machine le... | Do you use any tools to help manage machine le... | Do you use any tools to help manage machine le... | Do you use any tools to help manage machine le... | Do you use any tools to help manage machine le... | Do you use any tools to help manage machine le... | Do you use any tools to help manage machine le... | Do you use any tools to help manage machine le... | Do you use any tools to help manage machine le... | Do you use any tools to help manage machine le... | Where do you publicly share or deploy your dat... | Where do you publicly share or deploy your dat... | Where do you publicly share or deploy your dat... | Where do you publicly share or deploy your dat... | Where do you publicly share or deploy your dat... | Where do you publicly share or deploy your dat... | Where do you publicly share or deploy your dat... | Where do you publicly share or deploy your dat... 
| Where do you publicly share or deploy your dat... | Where do you publicly share or deploy your dat... | On which platforms have you begun or completed... | On which platforms have you begun or completed... | On which platforms have you begun or completed... | On which platforms have you begun or completed... | On which platforms have you begun or completed... | On which platforms have you begun or completed... | On which platforms have you begun or completed... | On which platforms have you begun or completed... | On which platforms have you begun or completed... | On which platforms have you begun or completed... | On which platforms have you begun or completed... | On which platforms have you begun or completed... | What is the primary tool that you use at work ... | Who/what are your favorite media sources that ... | Who/what are your favorite media sources that ... | Who/what are your favorite media sources that ... | Who/what are your favorite media sources that ... | Who/what are your favorite media sources that ... | Who/what are your favorite media sources that ... | Who/what are your favorite media sources that ... | Who/what are your favorite media sources that ... | Who/what are your favorite media sources that ... | Who/what are your favorite media sources that ... | Who/what are your favorite media sources that ... | Who/what are your favorite media sources that ... | Which of the following cloud computing platfor... | Which of the following cloud computing platfor... | Which of the following cloud computing platfor... | Which of the following cloud computing platfor... | Which of the following cloud computing platfor... | Which of the following cloud computing platfor... | Which of the following cloud computing platfor... | Which of the following cloud computing platfor... | Which of the following cloud computing platfor... | Which of the following cloud computing platfor... | Which of the following cloud computing platfor... 
| Which of the following cloud computing platfor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | Which of the following big data products (rela... | Which of the following big data products (rela... | Which of the following big data products (rela... | Which of the following big data products (rela... | Which of the following big data products (rela... | Which of the following big data products (rela... | Which of the following big data products (rela... | Which of the following big data products (rela... | Which of the following big data products (rela... | Which of the following big data products (rela... | Which of the following big data products (rela... | Which of the following big data products (rela... | Which of the following big data products (rela... | Which of the following big data products (rela... 
| Which of the following big data products (rela... | Which of the following big data products (rela... | Which of the following big data products (rela... | Which of the following big data products (rela... | Which of the following business intelligence t... | Which of the following business intelligence t... | Which of the following business intelligence t... | Which of the following business intelligence t... | Which of the following business intelligence t... | Which of the following business intelligence t... | Which of the following business intelligence t... | Which of the following business intelligence t... | Which of the following business intelligence t... | Which of the following business intelligence t... | Which of the following business intelligence t... | Which of the following business intelligence t... | Which of the following business intelligence t... | Which of the following business intelligence t... | Which of the following business intelligence t... | Which categories of automated machine learning... | Which categories of automated machine learning... | Which categories of automated machine learning... | Which categories of automated machine learning... | Which categories of automated machine learning... | Which categories of automated machine learning... | Which categories of automated machine learning... | Which categories of automated machine learning... | Which specific automated machine learning tool... | Which specific automated machine learning tool... | Which specific automated machine learning tool... | Which specific automated machine learning tool... | Which specific automated machine learning tool... | Which specific automated machine learning tool... | Which specific automated machine learning tool... | Which specific automated machine learning tool... | Which specific automated machine learning tool... | Which specific automated machine learning tool... | Which specific automated machine learning tool... 
| Which specific automated machine learning tool... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... |
| 1 | 1838 | 35-39 | Man | Colombia | Doctoral degree | Student | 5-10 years | Python | R | SQL | C | NaN | NaN | Javascript | NaN | NaN | NaN | MATLAB | NaN | Other | Python | Jupyter (JupyterLab, Jupyter Notebooks, etc) | NaN | NaN | Visual Studio Code (VSCode) | NaN | Spyder | NaN | NaN | NaN | NaN | NaN | NaN | Kaggle Notebooks | Colab Notebooks | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | A cloud computing platform (AWS, Azure, GCP, h... | GPUs | NaN | NaN | NaN | 2-5 times | Matplotlib | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Geoplotlib | NaN | NaN | NaN | 1-2 years | NaN | TensorFlow | Keras | NaN | NaN | NaN | Xgboost | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Decision Trees or Random Forests | Gradient Boosting Machines (xgboost, lightgbm,... | Bayesian Approaches | NaN | Dense Neural Networks (MLPs, etc) | Convolutional Neural Networks | NaN | Recurrent Neural Networks | NaN | NaN | NaN | NaN | NaN | NaN | Image classification and other general purpose... | NaN | NaN | NaN | Word embeddings/vectors (GLoVe, fastText, word... | NaN | Contextualized embeddings (ELMo, CoVe) | Transformer language models (GPT-3, BERT, XLne... 
| NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Coursera | NaN | Kaggle Learn Courses | NaN | NaN | NaN | NaN | NaN | NaN | University Courses (resulting in a university ... | NaN | NaN | Basic statistical software (Microsoft Excel, G... | NaN | NaN | NaN | Kaggle (notebooks, forums, etc) | NaN | NaN | NaN | NaN | Journal Publications (peer-reviewed journals, ... | NaN | NaN | NaN | Amazon Web Services (AWS) | Microsoft Azure | Google Cloud Platform (GCP) | IBM Cloud / Red Hat | NaN | SAP Cloud | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Azure Cloud Services | Microsoft Azure Container Instances | Azure Functions | Google Cloud Compute Engine | Google Cloud Functions | Google Cloud Run | Google Cloud App Engine | NaN | NaN | Amazon SageMaker | Amazon Forecast | Amazon Rekognition | Azure Machine Learning Studio | Azure Cognitive Services | Google Cloud AI Platform / Google Cloud ML En... 
| Google Cloud Video AI | Google Cloud Natural Language | Google Cloud Vision AI | NaN | NaN | NaN | NaN | NaN | NaN | MongoDB | NaN | NaN | Microsoft SQL Server | NaN | NaN | NaN | NaN | NaN | Google Cloud BigQuery | Google Cloud SQL | Google Cloud Firestore | NaN | NaN | Microsoft Power BI | Amazon QuickSight | Google Data Studio | NaN | Tableau | NaN | NaN | NaN | NaN | NaN | NaN | NaN | SAP Analytics Cloud | NaN | NaN | Automated data augmentation (e.g. imgaug, albu... | NaN | NaN | NaN | Automated hyperparameter tuning (e.g. hyperopt... | Automation of full ML pipelines (e.g. Google C... | NaN | NaN | Google Cloud AutoML | NaN | Databricks AutoML | NaN | NaN | Auto-Keras | Auto-Sklearn | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | TensorBoard | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 289287 | 30-34 | Man | United States of America | Master’s degree | Data Engineer | 5-10 years | Python | R | SQL | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Python | NaN | NaN | Visual Studio | NaN | PyCharm | NaN | NaN | Sublime Text | NaN | NaN | NaN | NaN | NaN | Colab Notebooks | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | A personal computer or laptop | GPUs | NaN | NaN | NaN | 2-5 times | Matplotlib | Seaborn | NaN | Ggplot / ggplot2 | Shiny | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1-2 years | Scikit-learn | TensorFlow | Keras | PyTorch | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Linear or Logistic Regression | NaN | NaN | NaN | NaN | NaN | Convolutional Neural Networks | NaN | NaN | Transformer Networks (BERT, gpt-3, etc) | NaN | NaN | NaN | Image segmentation methods (U-Net, Mask R-CNN,... | NaN | Image classification and other general purpose... | NaN | NaN | NaN | NaN | NaN | Contextualized embeddings (ELMo, CoVe) | Transformer language models (GPT-3, BERT, XLne... | NaN | NaN | 10,000 or more employees | 20+ | We have well established ML methods (i.e., mod... | Analyze and understand data to influence produ... | NaN | NaN | NaN | NaN | Do research that advances the state of the art... 
| NaN | NaN | 100,000-124,999 | $100,000 or more ($USD) | Amazon Web Services (AWS) | Microsoft Azure | Google Cloud Platform (GCP) | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Amazon EC2 | AWS Lambda | NaN | NaN | NaN | Azure Functions | Google Cloud Compute Engine | NaN | NaN | NaN | NaN | NaN | Amazon SageMaker | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | PostgresSQL | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Amazon Redshift | Amazon Athena | NaN | NaN | NaN | NaN | NaN | NaN | PostgresSQL | Amazon QuickSight | Microsoft Power BI | NaN | NaN | Tableau | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Microsoft Power BI | NaN | NaN | NaN | NaN | NaN | NaN | No / None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | No / None | NaN | NaN | NaN | NaN | GitHub | NaN | NaN | NaN | NaN | NaN | NaN | Coursera | NaN | NaN | DataCamp | NaN | NaN | Udemy | NaN | NaN | NaN | NaN | NaN | Business intelligence software (Salesforce, Ta... | Twitter (data science influencers) | NaN | Reddit (r/machinelearning, etc) | Kaggle (notebooks, forums, etc) | Course Forums (forums.fast.ai, Coursera forums... | YouTube (Kaggle YouTube, Cloud AI Adventures, ... | NaN | Blogs (Towards Data Science, Analytics Vidhya,... 
| NaN | Slack Communities (ods.ai, kagglenoobs, etc) | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 860 | 35-39 | Man | Argentina | Bachelor’s degree | Software Engineer | 10-20 years | NaN | NaN | NaN | NaN | NaN | Java | Javascript | NaN | NaN | Bash | NaN | NaN | NaN | R | NaN | NaN | NaN | Visual Studio Code (VSCode) | NaN | NaN | Notepad++ | Sublime Text | Vim / Emacs | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | None | NaN | A personal computer or laptop | NaN | NaN | None | NaN | Never | NaN | NaN | NaN | NaN | NaN | D3 js | NaN | NaN | NaN | NaN | NaN | NaN | I do not use machine learning methods | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1000-9,999 employees | 0 | No (we do not use ML methods) | NaN | NaN | NaN | NaN | NaN | NaN | None of these activities are an important part... | NaN | 15,000-19,999 | $0 ($USD) | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | MySQL | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | No / None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | No / None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Coursera | edX | NaN | NaN | NaN | Udacity | Udemy | NaN | NaN | NaN | NaN | NaN | Basic statistical software (Microsoft Excel, G... | NaN | Email newsletters (Data Elixir, O'Reilly Data ... | NaN | Kaggle (notebooks, forums, etc) | NaN | YouTube (Kaggle YouTube, Cloud AI Adventures, ... 
| NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | MySQL | NaN | NaN | NaN | NaN | NaN | NaN | Microsoft SQL Server | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | None | NaN |
| 4 | 507 | 30-34 | Man | United States of America | Master’s degree | Data Scientist | 5-10 years | Python | NaN | SQL | NaN | NaN | NaN | NaN | NaN | NaN | Bash | NaN | NaN | NaN | Python | NaN | NaN | NaN | NaN | PyCharm | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | None | NaN | A cloud computing platform (AWS, Azure, GCP, h... | NaN | NaN | None | NaN | 2-5 times | Matplotlib | Seaborn | Plotly / Plotly Express | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 3-4 years | Scikit-learn | TensorFlow | Keras | NaN | NaN | NaN | Xgboost | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Linear or Logistic Regression | Decision Trees or Random Forests | Gradient Boosting Machines (xgboost, lightgbm,... | Bayesian Approaches | NaN | Dense Neural Networks (MLPs, etc) | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 250-999 employees | 5-9 | We have well established ML methods (i.e., mod... | Analyze and understand data to influence produ... | NaN | NaN | Build and/or run a machine learning service th... | Experimentation and iteration to improve exist... 
| NaN | NaN | NaN | 125,000-149,999 | $10,000-$99,999 | Amazon Web Services (AWS) | NaN | NaN | NaN | NaN | NaN | Salesforce Cloud | NaN | NaN | NaN | NaN | NaN | Amazon EC2 | AWS Lambda | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | No / None | NaN | MySQL | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Amazon Redshift | NaN | NaN | NaN | NaN | NaN | NaN | NaN | MySQL | NaN | NaN | NaN | NaN | Tableau | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | No / None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | No / None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | I do not share my work publicly | NaN | Coursera | NaN | NaN | DataCamp | NaN | NaN | NaN | LinkedIn Learning | NaN | University Courses (resulting in a university ... | NaN | NaN | Local development environments (RStudio, Jupyt... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# Group the survey columns by question id (the part of a column name before the first '_').
# dict.fromkeys removes duplicate ids while keeping the order in which they first appear.
qnums = list(dict.fromkeys(col.split('_')[0] for col in df_fin.columns))
# Map every question id to the data that belongs to it.
Questions = {}
for qnum in qnums:
    if qnum in ('Q1', 'Q2', 'Q3'):
        # These ids are also prefixes of later questions (e.g. 'Q1' of 'Q10'), so the
        # startswith() match below would be too broad; take the exact column instead.
        Questions[qnum] = df_fin[qnum]
        continue
    Questions[qnum] = df_fin[[col for col in df_fin.columns if col.startswith(qnum)]]
# one sub-dataframe per answer given to the gender question (Q2)
Genders = {
    gender: df_fin[df_fin.Q2 == gender]
    for gender in df_fin.Q2.unique()
}
# share of respondents per gender selection (Q2)
gender_counts = df_fin['Q2'].value_counts()
gender_counts / gender_counts.sum()
Man 0.788032 Woman 0.193552 Prefer not to say 0.013126 Prefer to self-describe 0.002695 Nonbinary 0.002595 Name: Q2, dtype: float64
# keep only the 'Man' / 'Woman' answers for simplicity (not that prefer not & nonbinary aren't important!)
is_man_or_woman = df_fin['Q2'].isin(['Man', 'Woman'])
df_mf = df_fin[is_man_or_woman]
# DS is clearly already a male dominated field (or at least this sample of kaggle users is)
mf_counts = df_mf['Q2'].value_counts()
mf_counts / mf_counts.sum()
Man 0.802817 Woman 0.197183 Name: Q2, dtype: float64
# Number of men and women (Q2) for each level of education (Q4)
fig = px.histogram(data_frame=df_mf, color='Q2', x='Q4')
fig.show()
This plot shows the number of men and women in each category (level of education). It does not really tell us anything new, since we already know that this dataset is male dominated.
# Distribution of level of education (Q4) within each gender (Q2).
# histnorm is applied per colour trace, so each gender's bars are normalised by that gender's own total.
# barmode='group' draws the genders side by side; the default stacks the normalised bars on top of
# each other, which makes the two distributions impossible to compare.
fig = px.histogram(df_mf, x='Q4', color='Q2', histnorm='probability density', barmode='group')
fig.show()
This plot shows the percentage of women that are in each category. Right off the bat we see that a lot of women have Master's degrees (42.7%) and this is closely followed by 34% in the Bachelor's degree category. Men have 39.7% and 36.2% in Master's and Bachelor's degree categories respectively.
# Do more women have a particular Level of Education than men
# share of each level of education (Q4) within men and within women
male_degrees = df_mf.loc[df_mf.Q2 == 'Man', 'Q4'].value_counts(normalize=True)
female_degrees = df_mf.loc[df_mf.Q2 == 'Woman', 'Q4'].value_counts(normalize=True)
# positive where the level is more common among women than among men
more_women = female_degrees - male_degrees
# red bars for negative differences, blue otherwise
color = ['red' if diff < 0 else 'blue' for diff in more_women.values]
bars = go.Bar(x=more_women.index, y=more_women.values, marker_color=color)
fig = go.Figure(bars)
fig.update_layout(
    title="How much more/less women are there in proportion to men for each Level of Education",
    xaxis={'categoryorder': 'total descending'},
)
fig.show()
In this plot, we see that a higher percentage of women have Masters and Doctoral degrees when compared to men. Seeing that the percentage of women is less in Bachelor's degree, we can assume that women prefer to further their education after getting a Bachelor's degree.
Also, a lower percentage of women have no formal education past high school and also, a lower percentage had some level of college education but they did not finish.
# Share of women within each level of education (Q4), relative to their share of the sample
total_degrees = df_mf.Q4.value_counts()
# reindex on the full set of levels so a level without any women counts as 0
# instead of turning into NaN (which would silently drop its bar from the plot)
female_degrees = df_mf[df_mf.Q2 == 'Woman'].Q4.value_counts().reindex(total_degrees.index, fill_value=0)
more_women = (female_degrees / total_degrees) - .197 # greater proportion of women than sample proportion
color = np.where(more_women.values < 0, 'red', 'blue') # red: below the sample share
fig = go.Figure(go.Bar(x=more_women.index, y=more_women.values, marker_color=color))
fig.update_layout(title= "Level of Female Education Relative to Percentage of Sample women (19.7%)")
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()
This plot shows how much the proportion of women in each level of education is better/worse than the proportion of women in the sample. In an ideal scenario, the proportion of women for each level of education would be the same as the proportion of women in the sample but that is not the case.
The most obvious is the No formal education past high school category. Here, the proportion of women in this category is very small (about 13.6% less than the percentage of women in the sample, 19.7%). This is a good thing since it means that a higher percentage of women has some college education at least.
# Number of men and women (Q2) per country (Q3), x axis ordered by descending total
fig = px.histogram(data_frame=df_mf, color='Q2', x='Q3')
fig.update_xaxes(categoryorder='total descending')
fig.show()
# Share of women within each country (Q3), relative to their share of the sample
total_country = df_mf.Q3.value_counts()
# reindex on the full set of countries so a country without any women counts as 0
# instead of turning into NaN (which would silently drop its bar from the plot)
female_country = df_mf[df_mf.Q2 == 'Woman'].Q3.value_counts().reindex(total_country.index, fill_value=0)
more_women = (female_country / total_country) - .197 # greater proportion of women than sample
color = np.where(more_women.values < 0, 'red', 'blue') # red: below the sample share
fig = go.Figure(go.Bar(x=more_women.index, y=more_women.values, marker_color=color))
fig.update_layout(title= "Amount of Women By Country Relative to Percentage of Sample women (19.7%)")
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()
Malaysia has the highest proportion of women, about 19.5 percentage points above the proportion of women in the sample. Japan is the country with the lowest proportion of women in this sample.
# functions for creating new plots without having to repeat all that code
def women_share_vs_baseline(qnum, data, baseline):
    '''
    Compute how far the share of women in each category of a column is from a baseline.
    Args:
        qnum: name of the column in the dataframe to split by
        data: the dataframe; rows whose Q2 value is 'Woman' are counted as women,
              every row with a value in `qnum` is counted towards the category total
        baseline: share of women to compare each category with (e.g. .197)
    Returns:
        Series indexed by category holding (women / total) - baseline
    '''
    total = data[qnum].value_counts()
    # Reindex on every observed category so that a category without any women
    # counts as 0 instead of becoming NaN and silently vanishing from the plot.
    female = data.loc[data.Q2 == 'Woman', qnum].value_counts().reindex(total.index, fill_value=0)
    return (female / total) - baseline


def create_norm_graph(qnum, data, title, baseline):
    '''
    This function creates interactive plots using plotly graph objects.
    The function makes a bar plot that shows how much a category performed relative to a baseline provided.
    Args:
        qnum: column from the dataframe
        data: the dataframe
        title: title for the plot
        baseline: baseline to compare values of the column with
    Returns:
        None. The bar plot showing proportions relative to the baseline is displayed with fig.show()
    '''
    more_women = women_share_vs_baseline(qnum, data, baseline)  # greater proportion of women than sample
    # red bars: share of women below the baseline, blue bars: at or above it
    color = np.where(more_women.values < 0, 'red', 'blue')
    fig = go.Figure(go.Bar(x=more_women.index, y=more_women.values, marker_color=color))
    fig.update_layout(title=title)
    fig.update_layout(xaxis={'categoryorder': 'total descending'})
    fig.show()
    return
# which countries have the most relative female representatives in the survey?
create_norm_graph(
    qnum='Q3',
    data=df_mf,
    title="Amount of Women By Country Relative to Percentage of Sample women (19.7%)",
    baseline=.197,
)
# Which roles have the most women relative to the baseline?
create_norm_graph(
    qnum='Q5',
    data=df_mf,
    title="Amount of Women By Role Relative to Percentage of Sample women (19.7%)",
    baseline=.197,
)
Surprisingly (for me at least!), Product/Project Managers have a very low proportion of women, while Statisticians have a pretty high proportion of women (probably as a result of all those degrees they bagged). This theory is supported by the fact that a higher proportion of women are students.
Notice how the roles at the tail end of the plot seem like the higher paying roles.
# new baseline: only employed respondents, since we want to look at earning potential
not_working = df_mf['Q5'].isin(['Student', 'Currently not employed'])
df_workers_mf = df_mf[~not_working]
worker_counts = df_workers_mf['Q2'].value_counts()
worker_counts / worker_counts.sum()
Man 0.825956 Woman 0.174044 Name: Q2, dtype: float64
# Share of women by years of coding experience (Q6) among employed respondents
create_norm_graph(
    qnum='Q6',
    data=df_workers_mf,
    title="Amount of Women By Experience Relative to Percentage of Sample women (17.4%)",
    baseline=.174,
)
A higher proportion of women are in the early stages of their coding career, and the proportion of women keeps decreasing as the years of coding experience increase. Since experience is a factor in earning potential, this is something to keep in mind.
# Share of women by income level (Q24) among employed respondents
create_norm_graph(
    qnum='Q24',
    data=df_workers_mf,
    title="Amount of Women By Income Level Relative to Percentage of Sample women (17.4%)",
    baseline=.174,
)
From the plot above, as predicted, the proportion of women is typically above the sample average in the lower income ranges.
# same income graph, restricted to respondents whose role (Q5) is Data Scientist
is_data_scientist = df_mf['Q5'] == 'Data Scientist'
df_mf_ds = df_mf[is_data_scientist]
create_norm_graph(
    qnum='Q24',
    data=df_mf_ds,
    title="Amount of Female Data Scientists By Income Level Relative to Percentage of Sample women (17.4%)",
    baseline=.174,
)
Same as the previous plot, the higher salary ranges typically have a lower proportion of women compared to the average proportion. Although there are a few higher income levels that are on the left side of the plot, there are not enough instances in the dataset to say that it is a common trend.
# respondents per income band for perspective; some bands are small (sample size issues)
df_mf_ds['Q24'].value_counts()
$0-999 439 100,000-124,999 163 10,000-14,999 136 40,000-49,999 130 30,000-39,999 119 150,000-199,999 108 125,000-149,999 106 50,000-59,999 103 1,000-1,999 103 70,000-79,999 99 15,000-19,999 95 60,000-69,999 87 20,000-24,999 86 90,000-99,999 84 25,000-29,999 84 5,000-7,499 75 80,000-89,999 75 7,500-9,999 64 2,000-2,999 45 3,000-3,999 44 4,000-4,999 43 200,000-249,999 38 300,000-500,000 16 > $500,000 12 250,000-299,999 8 Name: Q24, dtype: int64
# income graph restricted to respondents from the United States, then counts per income band
in_us = df_mf['Q3'] == 'United States of America'
df_mf_US = df_mf[in_us]
create_norm_graph(
    qnum='Q24',
    data=df_mf_US,
    title="Amount of Women in the US Relative to Percentage of Sample women (17.4%)",
    baseline=.174,
)
df_mf_US['Q24'].value_counts()
100,000-124,999 244 150,000-199,999 225 125,000-149,999 179 90,000-99,999 110 $0-999 99 70,000-79,999 90 200,000-249,999 84 80,000-89,999 82 60,000-69,999 58 50,000-59,999 47 40,000-49,999 43 300,000-500,000 36 250,000-299,999 30 30,000-39,999 25 10,000-14,999 15 25,000-29,999 12 15,000-19,999 12 > $500,000 11 20,000-24,999 11 1,000-1,999 9 5,000-7,499 6 2,000-2,999 4 7,500-9,999 3 3,000-3,999 3 4,000-4,999 2 Name: Q24, dtype: int64
# Income by roles. Filtering for the top salary ranges.
top_salary_ranges = ['200,000-249,999', '250,000-299,999', '300,000-500,000', '> $500,000']
condition = df_fin['Q24'].isin(top_salary_ranges)
# Apply the mask to df_fin (the frame it was built from) and only then drop rows without a role.
# Masking the already-shortened dropna() result forced pandas to reindex the mask and emit
# "UserWarning: Boolean Series key will be reindexed to match DataFrame index".
# Rows matching the mask always have a non-null Q24, so only Q5 still needs the NaN filter.
df_top = df_fin[condition].dropna(subset=['Q5'])
fig = px.histogram(df_top, x='Q24', color ='Q5')
fig.update_xaxes(categoryorder="total descending")
fig.update_layout(title='Distribution of Roles among the top salary ranges')
fig.show()
<ipython-input-25-73f30e96e730>:4: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
We see that Data Scientists, Software Engineers and Research Scientists contribute a significant amount in these salary ranges.
# Income by years of coding experience. Filtering for the top salary ranges.
top_salary_ranges = ['200,000-249,999', '250,000-299,999', '300,000-500,000', '> $500,000']
condition = df_fin['Q24'].isin(top_salary_ranges)
# Apply the mask to df_fin (the frame it was built from) and only then drop rows without an
# experience answer. Masking the already-shortened dropna() result forced pandas to reindex the
# mask and emit "UserWarning: Boolean Series key will be reindexed to match DataFrame index".
# Rows matching the mask always have a non-null Q24, so only Q6 still needs the NaN filter.
df_top = df_fin[condition].dropna(subset=['Q6'])
fig = px.histogram(df_top, x='Q24', color='Q6')
fig.update_xaxes(categoryorder="total descending")
fig.update_layout(title='Distribution of Years of coding experience among the top salary ranges')
fig.show()
<ipython-input-26-4db175ae7a1d>:4: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
As expected, respondents with more years of coding experience dominate these salary ranges.
# Income by level of education. Filtering for the top salary ranges.
top_salary_ranges = ['200,000-249,999', '250,000-299,999', '300,000-500,000', '> $500,000']
condition = df_fin['Q24'].isin(top_salary_ranges)
# Apply the mask to df_fin (the frame it was built from) and only then drop rows without an
# education answer. Masking the already-shortened dropna() result forced pandas to reindex the
# mask and emit "UserWarning: Boolean Series key will be reindexed to match DataFrame index".
# Rows matching the mask always have a non-null Q24, so only Q4 still needs the NaN filter.
df_top = df_fin[condition].dropna(subset=['Q4'])
fig = px.histogram(df_top, x='Q24', color ='Q4')
fig.update_xaxes(categoryorder="total descending")
fig.update_layout(title='Distribution of Levels of Education among the top salary ranges')
fig.show()
<ipython-input-27-eddb7ae91618>:4: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
As expected, Degree holders are found more in these top salary ranges.
I thought it made more sense to use a regression here to try to predict salary. Although it will be very rough around the edges, I think converting the salaries from categorical to numeric will allow us to interpret the data more easily.
# strip '$', ',' and '>' from the salary labels (Q24)
# .copy() makes df_model an independent dataframe, so adding columns to it no longer
# triggers pandas' SettingWithCopyWarning
df_model = df_fin.dropna(subset=['Q24']).copy()
df_model['salary_cleaned'] = df_model.Q24.apply(lambda x: str(x).replace('$', '').replace(',', '').replace('>', '').strip())
df_model.salary_cleaned.value_counts()
<ipython-input-28-a6a7a913391f>:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
0-999 2128 10000-14999 665 1000-1999 581 100000-124999 573 40000-49999 552 30000-39999 540 50000-59999 510 5000-7499 488 15000-19999 449 60000-69999 408 20000-24999 404 70000-79999 394 7500-9999 371 150000-199999 347 2000-2999 330 125000-149999 315 25000-29999 310 90000-99999 280 4000-4999 279 80000-89999 273 3000-3999 264 200000-249999 115 300000-500000 55 500000 50 250000-299999 48 Name: salary_cleaned, dtype: int64
# lower and upper bound of every salary band
def _salary_bound(label, position):
    # 'low-high' labels are split on '-'; a label without '-' is mapped to 500000 for both bounds
    if '-' in label:
        return int(label.split('-')[position])
    return 500000

df_model['salary_min'] = df_model['salary_cleaned'].map(lambda label: _salary_bound(label, 0))
df_model['salary_max'] = df_model['salary_cleaned'].map(lambda label: _salary_bound(label, 1))
df_model['salary_max'].value_counts()
<ipython-input-29-3f085ad11601>:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy <ipython-input-29-3f085ad11601>:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
999 2128 14999 665 1999 581 124999 573 49999 552 39999 540 59999 510 7499 488 19999 449 69999 408 24999 404 79999 394 9999 371 199999 347 2999 330 149999 315 29999 310 99999 280 4999 279 89999 273 3999 264 249999 115 500000 105 299999 48 Name: salary_max, dtype: int64
# use the midpoint of each salary band as a rough continuous salary
band_sum = df_model['salary_min'] + df_model['salary_max']
df_model['aprox_salary'] = band_sum / 2
df_model['aprox_salary'].value_counts()
<ipython-input-30-78a955329e39>:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
499.5 2128 12499.5 665 1499.5 581 112499.5 573 44999.5 552 34999.5 540 54999.5 510 6249.5 488 17499.5 449 64999.5 408 22499.5 404 74999.5 394 8749.5 371 174999.5 347 2499.5 330 137499.5 315 27499.5 310 94999.5 280 4499.5 279 84999.5 273 3499.5 264 224999.5 115 400000.0 55 500000.0 50 274999.5 48 Name: aprox_salary, dtype: int64
# simple linear regression just gender
import statsmodels.api as sm
# Keep respondents who answered Man or Woman, then keep only workers
# (students and the currently unemployed are excluded), and discard the
# survey-duration column, which is not a respondent attribute.
df_model_fin = (
    df_model
    .loc[df_model['Q2'].isin(['Man', 'Woman'])]
    .loc[lambda frame: ~frame['Q5'].isin(['Student', 'Currently not employed'])]
    .drop(columns='Time from Start to Finish (seconds)')
)
# flag, per column, whether any value is missing
df_model_fin.isnull().any()
Q1 False Q2 False Q3 False Q4 False Q5 False Q6 False Q7_Part_1 True Q7_Part_2 True Q7_Part_3 True Q7_Part_4 True Q7_Part_5 True Q7_Part_6 True Q7_Part_7 True Q7_Part_8 True Q7_Part_9 True Q7_Part_10 True Q7_Part_11 True Q7_Part_12 True Q7_OTHER True Q8 True Q9_Part_1 True Q9_Part_2 True Q9_Part_3 True Q9_Part_4 True Q9_Part_5 True Q9_Part_6 True Q9_Part_7 True Q9_Part_8 True Q9_Part_9 True Q9_Part_10 True Q9_Part_11 True Q9_OTHER True Q10_Part_1 True Q10_Part_2 True Q10_Part_3 True Q10_Part_4 True Q10_Part_5 True Q10_Part_6 True Q10_Part_7 True Q10_Part_8 True Q10_Part_9 True Q10_Part_10 True Q10_Part_11 True Q10_Part_12 True Q10_Part_13 True Q10_OTHER True Q11 True Q12_Part_1 True Q12_Part_2 True Q12_Part_3 True Q12_OTHER True Q13 True Q14_Part_1 True Q14_Part_2 True Q14_Part_3 True Q14_Part_4 True Q14_Part_5 True Q14_Part_6 True Q14_Part_7 True Q14_Part_8 True Q14_Part_9 True Q14_Part_10 True Q14_Part_11 True Q14_OTHER True Q15 True Q16_Part_1 True Q16_Part_2 True Q16_Part_3 True Q16_Part_4 True Q16_Part_5 True Q16_Part_6 True Q16_Part_7 True Q16_Part_8 True Q16_Part_9 True Q16_Part_10 True Q16_Part_11 True Q16_Part_12 True Q16_Part_13 True Q16_Part_14 True Q16_Part_15 True Q16_OTHER True Q17_Part_1 True Q17_Part_2 True Q17_Part_3 True Q17_Part_4 True Q17_Part_5 True Q17_Part_6 True Q17_Part_7 True Q17_Part_8 True Q17_Part_9 True Q17_Part_10 True Q17_Part_11 True Q17_OTHER True Q18_Part_1 True Q18_Part_2 True Q18_Part_3 True Q18_Part_4 True Q18_Part_5 True Q18_Part_6 True Q18_OTHER True Q19_Part_1 True Q19_Part_2 True Q19_Part_3 True Q19_Part_4 True Q19_Part_5 True Q19_OTHER True Q20 False Q21 False Q22 False Q23_Part_1 True Q23_Part_2 True Q23_Part_3 True Q23_Part_4 True Q23_Part_5 True Q23_Part_6 True Q23_Part_7 True Q23_OTHER True Q24 False Q25 True Q26_A_Part_1 True Q26_A_Part_2 True Q26_A_Part_3 True Q26_A_Part_4 True Q26_A_Part_5 True Q26_A_Part_6 True Q26_A_Part_7 True Q26_A_Part_8 True Q26_A_Part_9 True Q26_A_Part_10 True Q26_A_Part_11 True Q26_A_OTHER 
True Q27_A_Part_1 True Q27_A_Part_2 True Q27_A_Part_3 True Q27_A_Part_4 True Q27_A_Part_5 True Q27_A_Part_6 True Q27_A_Part_7 True Q27_A_Part_8 True Q27_A_Part_9 True Q27_A_Part_10 True Q27_A_Part_11 True Q27_A_OTHER True Q28_A_Part_1 True Q28_A_Part_2 True Q28_A_Part_3 True Q28_A_Part_4 True Q28_A_Part_5 True Q28_A_Part_6 True Q28_A_Part_7 True Q28_A_Part_8 True Q28_A_Part_9 True Q28_A_Part_10 True Q28_A_OTHER True Q29_A_Part_1 True Q29_A_Part_2 True Q29_A_Part_3 True Q29_A_Part_4 True Q29_A_Part_5 True Q29_A_Part_6 True Q29_A_Part_7 True Q29_A_Part_8 True Q29_A_Part_9 True Q29_A_Part_10 True Q29_A_Part_11 True Q29_A_Part_12 True Q29_A_Part_13 True Q29_A_Part_14 True Q29_A_Part_15 True Q29_A_Part_16 True Q29_A_Part_17 True Q29_A_OTHER True Q30 True Q31_A_Part_1 True Q31_A_Part_2 True Q31_A_Part_3 True Q31_A_Part_4 True Q31_A_Part_5 True Q31_A_Part_6 True Q31_A_Part_7 True Q31_A_Part_8 True Q31_A_Part_9 True Q31_A_Part_10 True Q31_A_Part_11 True Q31_A_Part_12 True Q31_A_Part_13 True Q31_A_Part_14 True Q31_A_OTHER True Q32 True Q33_A_Part_1 True Q33_A_Part_2 True Q33_A_Part_3 True Q33_A_Part_4 True Q33_A_Part_5 True Q33_A_Part_6 True Q33_A_Part_7 True Q33_A_OTHER True Q34_A_Part_1 True Q34_A_Part_2 True Q34_A_Part_3 True Q34_A_Part_4 True Q34_A_Part_5 True Q34_A_Part_6 True Q34_A_Part_7 True Q34_A_Part_8 True Q34_A_Part_9 True Q34_A_Part_10 True Q34_A_Part_11 True Q34_A_OTHER True Q35_A_Part_1 True Q35_A_Part_2 True Q35_A_Part_3 True Q35_A_Part_4 True Q35_A_Part_5 True Q35_A_Part_6 True Q35_A_Part_7 True Q35_A_Part_8 True Q35_A_Part_9 True Q35_A_Part_10 True Q35_A_OTHER True Q36_Part_1 True Q36_Part_2 True Q36_Part_3 True Q36_Part_4 True Q36_Part_5 True Q36_Part_6 True Q36_Part_7 True Q36_Part_8 True Q36_Part_9 True Q36_OTHER True Q37_Part_1 True Q37_Part_2 True Q37_Part_3 True Q37_Part_4 True Q37_Part_5 True Q37_Part_6 True Q37_Part_7 True Q37_Part_8 True Q37_Part_9 True Q37_Part_10 True Q37_Part_11 True Q37_OTHER True Q38 True Q39_Part_1 True Q39_Part_2 True 
Q39_Part_3 True Q39_Part_4 True Q39_Part_5 True Q39_Part_6 True Q39_Part_7 True Q39_Part_8 True Q39_Part_9 True Q39_Part_10 True Q39_Part_11 True Q39_OTHER True Q26_B_Part_1 True Q26_B_Part_2 True Q26_B_Part_3 True Q26_B_Part_4 True Q26_B_Part_5 True Q26_B_Part_6 True Q26_B_Part_7 True Q26_B_Part_8 True Q26_B_Part_9 True Q26_B_Part_10 True Q26_B_Part_11 True Q26_B_OTHER True Q27_B_Part_1 True Q27_B_Part_2 True Q27_B_Part_3 True Q27_B_Part_4 True Q27_B_Part_5 True Q27_B_Part_6 True Q27_B_Part_7 True Q27_B_Part_8 True Q27_B_Part_9 True Q27_B_Part_10 True Q27_B_Part_11 True Q27_B_OTHER True Q28_B_Part_1 True Q28_B_Part_2 True Q28_B_Part_3 True Q28_B_Part_4 True Q28_B_Part_5 True Q28_B_Part_6 True Q28_B_Part_7 True Q28_B_Part_8 True Q28_B_Part_9 True Q28_B_Part_10 True Q28_B_OTHER True Q29_B_Part_1 True Q29_B_Part_2 True Q29_B_Part_3 True Q29_B_Part_4 True Q29_B_Part_5 True Q29_B_Part_6 True Q29_B_Part_7 True Q29_B_Part_8 True Q29_B_Part_9 True Q29_B_Part_10 True Q29_B_Part_11 True Q29_B_Part_12 True Q29_B_Part_13 True Q29_B_Part_14 True Q29_B_Part_15 True Q29_B_Part_16 True Q29_B_Part_17 True Q29_B_OTHER True Q31_B_Part_1 True Q31_B_Part_2 True Q31_B_Part_3 True Q31_B_Part_4 True Q31_B_Part_5 True Q31_B_Part_6 True Q31_B_Part_7 True Q31_B_Part_8 True Q31_B_Part_9 True Q31_B_Part_10 True Q31_B_Part_11 True Q31_B_Part_12 True Q31_B_Part_13 True Q31_B_Part_14 True Q31_B_OTHER True Q33_B_Part_1 True Q33_B_Part_2 True Q33_B_Part_3 True Q33_B_Part_4 True Q33_B_Part_5 True Q33_B_Part_6 True Q33_B_Part_7 True Q33_B_OTHER True Q34_B_Part_1 True Q34_B_Part_2 True Q34_B_Part_3 True Q34_B_Part_4 True Q34_B_Part_5 True Q34_B_Part_6 True Q34_B_Part_7 True Q34_B_Part_8 True Q34_B_Part_9 True Q34_B_Part_10 True Q34_B_Part_11 True Q34_B_OTHER True Q35_B_Part_1 True Q35_B_Part_2 True Q35_B_Part_3 True Q35_B_Part_4 True Q35_B_Part_5 True Q35_B_Part_6 True Q35_B_Part_7 True Q35_B_Part_8 True Q35_B_Part_9 True Q35_B_Part_10 True Q35_B_OTHER True salary_cleaned False salary_min False 
salary_max False aprox_salary False dtype: bool
# create dummy variables, this is needed because essentially all our data is categorical.
# dtype=int pins the 0/1 integer encoding: from pandas 2.0 the default dummy
# dtype is bool, which would no longer match the 0/1 values used below and can
# cause problems when the dummies are mixed with float columns for modelling.
model_dummies = pd.get_dummies(df_model_fin, dtype=int)
model_dummies.head()
| salary_min | salary_max | aprox_salary | Q1_18-21 | Q1_22-24 | Q1_25-29 | Q1_30-34 | Q1_35-39 | Q1_40-44 | Q1_45-49 | Q1_50-54 | Q1_55-59 | Q1_60-69 | Q1_70+ | Q2_Man | Q2_Woman | Q3_Argentina | Q3_Australia | Q3_Bangladesh | Q3_Belarus | Q3_Belgium | Q3_Brazil | Q3_Canada | Q3_Chile | Q3_China | Q3_Colombia | Q3_Egypt | Q3_France | Q3_Germany | Q3_Ghana | Q3_Greece | Q3_India | Q3_Indonesia | Q3_Iran, Islamic Republic of... | Q3_Ireland | Q3_Israel | Q3_Italy | Q3_Japan | Q3_Kenya | Q3_Malaysia | Q3_Mexico | Q3_Morocco | Q3_Nepal | Q3_Netherlands | Q3_Nigeria | Q3_Other | Q3_Pakistan | Q3_Peru | Q3_Philippines | Q3_Poland | Q3_Portugal | Q3_Republic of Korea | Q3_Romania | Q3_Russia | Q3_Saudi Arabia | Q3_Singapore | Q3_South Africa | Q3_South Korea | Q3_Spain | Q3_Sri Lanka | Q3_Sweden | Q3_Switzerland | Q3_Taiwan | Q3_Thailand | Q3_Tunisia | Q3_Turkey | Q3_Ukraine | Q3_United Arab Emirates | Q3_United Kingdom of Great Britain and Northern Ireland | Q3_United States of America | Q3_Viet Nam | Q4_Bachelor’s degree | Q4_Doctoral degree | Q4_I prefer not to answer | Q4_Master’s degree | Q4_No formal education past high school | Q4_Professional degree | Q4_Some college/university study without earning a bachelor’s degree | Q5_Business Analyst | Q5_DBA/Database Engineer | Q5_Data Analyst | Q5_Data Engineer | Q5_Data Scientist | Q5_Machine Learning Engineer | Q5_Other | Q5_Product/Project Manager | Q5_Research Scientist | Q5_Software Engineer | Q5_Statistician | Q6_1-2 years | Q6_10-20 years | Q6_20+ years | Q6_3-5 years | Q6_5-10 years | Q6_< 1 years | Q6_I have never written code | Q7_Part_1_Python | Q7_Part_2_R | Q7_Part_3_SQL | Q7_Part_4_C | Q7_Part_5_C++ | Q7_Part_6_Java | Q7_Part_7_Javascript | Q7_Part_8_Julia | Q7_Part_9_Swift | Q7_Part_10_Bash | Q7_Part_11_MATLAB | Q7_Part_12_None | Q7_OTHER_Other | Q8_Bash | Q8_C | Q8_C++ | Q8_Java | Q8_Javascript | Q8_Julia | Q8_MATLAB | Q8_None | Q8_Other | Q8_Python | Q8_R | Q8_SQL | Q8_Swift | Q9_Part_1_Jupyter 
(JupyterLab, Jupyter Notebooks, etc) | Q9_Part_2_ RStudio | Q9_Part_3_Visual Studio | Q9_Part_4_Visual Studio Code (VSCode) | Q9_Part_5_ PyCharm | Q9_Part_6_ Spyder | Q9_Part_7_ Notepad++ | Q9_Part_8_ Sublime Text | Q9_Part_9_ Vim / Emacs | Q9_Part_10_ MATLAB | Q9_Part_11_None | Q9_OTHER_Other | Q10_Part_1_ Kaggle Notebooks | Q10_Part_2_Colab Notebooks | Q10_Part_3_Azure Notebooks | Q10_Part_4_ Paperspace / Gradient | Q10_Part_5_ Binder / JupyterHub | Q10_Part_6_ Code Ocean | Q10_Part_7_ IBM Watson Studio | Q10_Part_8_ Amazon Sagemaker Studio | Q10_Part_9_ Amazon EMR Notebooks | Q10_Part_10_Google Cloud AI Platform Notebooks | Q10_Part_11_Google Cloud Datalab Notebooks | Q10_Part_12_ Databricks Collaborative Notebooks | Q10_Part_13_None | Q10_OTHER_Other | Q11_A cloud computing platform (AWS, Azure, GCP, hosted notebooks, etc) | Q11_A deep learning workstation (NVIDIA GTX, LambdaLabs, etc) | Q11_A personal computer or laptop | Q11_None | Q11_Other | Q12_Part_1_GPUs | Q12_Part_2_TPUs | Q12_Part_3_None | Q12_OTHER_Other | Q13_2-5 times | Q13_6-25 times | Q13_More than 25 times | Q13_Never | Q13_Once | Q14_Part_1_ Matplotlib | Q14_Part_2_ Seaborn | Q14_Part_3_ Plotly / Plotly Express | Q14_Part_4_ Ggplot / ggplot2 | Q14_Part_5_ Shiny | Q14_Part_6_ D3 js | Q14_Part_7_ Altair | Q14_Part_8_ Bokeh | Q14_Part_9_ Geoplotlib | Q14_Part_10_ Leaflet / Folium | Q14_Part_11_None | Q14_OTHER_Other | Q15_1-2 years | Q15_10-20 years | Q15_2-3 years | Q15_20 or more years | Q15_3-4 years | Q15_4-5 years | Q15_5-10 years | Q15_I do not use machine learning methods | Q15_Under 1 year | Q16_Part_1_ Scikit-learn | Q16_Part_2_ TensorFlow | Q16_Part_3_ Keras | Q16_Part_4_ PyTorch | Q16_Part_5_ Fast.ai | Q16_Part_6_ MXNet | Q16_Part_7_ Xgboost | Q16_Part_8_ LightGBM | Q16_Part_9_ CatBoost | Q16_Part_10_ Prophet | Q16_Part_11_ H2O 3 | Q16_Part_12_ Caret | Q16_Part_13_ Tidymodels | Q16_Part_14_ JAX | Q16_Part_15_None | Q16_OTHER_Other | Q17_Part_1_Linear or Logistic Regression | 
Q17_Part_2_Decision Trees or Random Forests | Q17_Part_3_Gradient Boosting Machines (xgboost, lightgbm, etc) | Q17_Part_4_Bayesian Approaches | Q17_Part_5_Evolutionary Approaches | Q17_Part_6_Dense Neural Networks (MLPs, etc) | Q17_Part_7_Convolutional Neural Networks | Q17_Part_8_Generative Adversarial Networks | Q17_Part_9_Recurrent Neural Networks | Q17_Part_10_Transformer Networks (BERT, gpt-3, etc) | Q17_Part_11_None | Q17_OTHER_Other | Q18_Part_1_General purpose image/video tools (PIL, cv2, skimage, etc) | Q18_Part_2_Image segmentation methods (U-Net, Mask R-CNN, etc) | Q18_Part_3_Object detection methods (YOLOv3, RetinaNet, etc) | Q18_Part_4_Image classification and other general purpose networks (VGG, Inception, ResNet, ResNeXt, NASNet, EfficientNet, etc) | Q18_Part_5_Generative Networks (GAN, VAE, etc) | Q18_Part_6_None | Q18_OTHER_Other | Q19_Part_1_Word embeddings/vectors (GLoVe, fastText, word2vec) | Q19_Part_2_Encoder-decorder models (seq2seq, vanilla transformers) | Q19_Part_3_Contextualized embeddings (ELMo, CoVe) | Q19_Part_4_Transformer language models (GPT-3, BERT, XLnet, etc) | Q19_Part_5_None | Q19_OTHER_Other | Q20_0-49 employees | Q20_10,000 or more employees | Q20_1000-9,999 employees | Q20_250-999 employees | Q20_50-249 employees | Q21_0 | Q21_1-2 | Q21_10-14 | Q21_15-19 | Q21_20+ | Q21_3-4 | Q21_5-9 | Q22_I do not know | Q22_No (we do not use ML methods) | Q22_We are exploring ML methods (and may one day put a model into production) | Q22_We have well established ML methods (i.e., models in production for more than 2 years) | Q22_We recently started using ML methods (i.e., models in production for less than 2 years) | Q22_We use ML methods for generating insights (but do not put working models into production) | Q23_Part_1_Analyze and understand data to influence product or business decisions | Q23_Part_2_Build and/or run the data infrastructure that my business uses for storing, analyzing, and operationalizing data | Q23_Part_3_Build 
prototypes to explore applying machine learning to new areas | Q23_Part_4_Build and/or run a machine learning service that operationally improves my product or workflows | Q23_Part_5_Experimentation and iteration to improve existing ML models | Q23_Part_6_Do research that advances the state of the art of machine learning | Q23_Part_7_None of these activities are an important part of my role at work | Q23_OTHER_Other | ... | Q29_A_Part_10_Microsoft Azure Data Lake Storage | Q29_A_Part_11_Amazon Redshift | Q29_A_Part_12_Amazon Athena | Q29_A_Part_13_Amazon DynamoDB | Q29_A_Part_14_Google Cloud BigQuery | Q29_A_Part_15_Google Cloud SQL | Q29_A_Part_16_Google Cloud Firestore | Q29_A_Part_17_None | Q29_A_OTHER_Other | Q30_Amazon Athena | Q30_Amazon DynamoDB | Q30_Amazon Redshift | Q30_Google Cloud BigQuery | Q30_Google Cloud Firestore | Q30_Google Cloud SQL | Q30_IBM Db2 | Q30_Microsoft Access | Q30_Microsoft Azure Data Lake Storage | Q30_Microsoft SQL Server | Q30_MongoDB | Q30_MySQL | Q30_Oracle Database | Q30_Other | Q30_PostgresSQL | Q30_SQLite | Q30_Snowflake | Q31_A_Part_1_Amazon QuickSight | Q31_A_Part_2_Microsoft Power BI | Q31_A_Part_3_Google Data Studio | Q31_A_Part_4_Looker | Q31_A_Part_5_Tableau | Q31_A_Part_6_Salesforce | Q31_A_Part_7_Einstein Analytics | Q31_A_Part_8_Qlik | Q31_A_Part_9_Domo | Q31_A_Part_10_TIBCO Spotfire | Q31_A_Part_11_Alteryx | Q31_A_Part_12_Sisense | Q31_A_Part_13_SAP Analytics Cloud | Q31_A_Part_14_None | Q31_A_OTHER_Other | Q32_Alteryx | Q32_Amazon QuickSight | Q32_Domo | Q32_Einstein Analytics | Q32_Google Data Studio | Q32_Looker | Q32_Microsoft Power BI | Q32_Other | Q32_Qlik | Q32_SAP Analytics Cloud | Q32_Salesforce | Q32_Sisense | Q32_TIBCO Spotfire | Q32_Tableau | Q33_A_Part_1_Automated data augmentation (e.g. imgaug, albumentations) | Q33_A_Part_2_Automated feature engineering/selection (e.g. tpot, boruta_py) | Q33_A_Part_3_Automated model selection (e.g. 
auto-sklearn, xcessiv) | Q33_A_Part_4_Automated model architecture searches (e.g. darts, enas) | Q33_A_Part_5_Automated hyperparameter tuning (e.g. hyperopt, ray.tune, Vizier) | Q33_A_Part_6_Automation of full ML pipelines (e.g. Google AutoML, H20 Driverless AI) | Q33_A_Part_7_No / None | Q33_A_OTHER_Other | Q34_A_Part_1_ Google Cloud AutoML | Q34_A_Part_2_ H20 Driverless AI | Q34_A_Part_3_ Databricks AutoML | Q34_A_Part_4_ DataRobot AutoML | Q34_A_Part_5_ Tpot | Q34_A_Part_6_ Auto-Keras | Q34_A_Part_7_ Auto-Sklearn | Q34_A_Part_8_ Auto_ml | Q34_A_Part_9_ Xcessiv | Q34_A_Part_10_ MLbox | Q34_A_Part_11_No / None | Q34_A_OTHER_Other | Q35_A_Part_1_ Neptune.ai | Q35_A_Part_2_ Weights & Biases | Q35_A_Part_3_ Comet.ml | Q35_A_Part_4_ Sacred + Omniboard | Q35_A_Part_5_ TensorBoard | Q35_A_Part_6_ Guild.ai | Q35_A_Part_7_ Polyaxon | Q35_A_Part_8_ Trains | Q35_A_Part_9_ Domino Model Monitor | Q35_A_Part_10_No / None | Q35_A_OTHER_Other | Q36_Part_1_ Plotly Dash | Q36_Part_2_ Streamlit | Q36_Part_3_ NBViewer | Q36_Part_4_ GitHub | Q36_Part_5_ Personal blog | Q36_Part_6_ Kaggle | Q36_Part_7_ Colab | Q36_Part_8_ Shiny | Q36_Part_9_I do not share my work publicly | Q36_OTHER_Other | Q37_Part_1_Coursera | Q37_Part_2_edX | Q37_Part_3_Kaggle Learn Courses | Q37_Part_4_DataCamp | Q37_Part_5_Fast.ai | Q37_Part_6_Udacity | Q37_Part_7_Udemy | Q37_Part_8_LinkedIn Learning | Q37_Part_9_Cloud-certification programs (direct from AWS, Azure, GCP, or similar) | Q37_Part_10_University Courses (resulting in a university degree) | Q37_Part_11_None | Q37_OTHER_Other | Q38_Advanced statistical software (SPSS, SAS, etc.) | Q38_Basic statistical software (Microsoft Excel, Google Sheets, etc.) | Q38_Business intelligence software (Salesforce, Tableau, Spotfire, etc.) | Q38_Cloud-based data software & APIs (AWS, GCP, Azure, etc.) | Q38_Local development environments (RStudio, JupyterLab, etc.) 
| Q38_Other | Q39_Part_1_Twitter (data science influencers) | Q39_Part_2_Email newsletters (Data Elixir, O'Reilly Data & AI, etc) | Q39_Part_3_Reddit (r/machinelearning, etc) | Q39_Part_4_Kaggle (notebooks, forums, etc) | Q39_Part_5_Course Forums (forums.fast.ai, Coursera forums, etc) | Q39_Part_6_YouTube (Kaggle YouTube, Cloud AI Adventures, etc) | Q39_Part_7_Podcasts (Chai Time Data Science, O’Reilly Data Show, etc) | Q39_Part_8_Blogs (Towards Data Science, Analytics Vidhya, etc) | Q39_Part_9_Journal Publications (peer-reviewed journals, conference proceedings, etc) | Q39_Part_10_Slack Communities (ods.ai, kagglenoobs, etc) | Q39_Part_11_None | Q39_OTHER_Other | Q26_B_Part_1_ Amazon Web Services (AWS) | Q26_B_Part_2_ Microsoft Azure | Q26_B_Part_3_ Google Cloud Platform (GCP) | Q26_B_Part_4_ IBM Cloud / Red Hat | Q26_B_Part_5_ Oracle Cloud | Q26_B_Part_6_ SAP Cloud | Q26_B_Part_7_ VMware Cloud | Q26_B_Part_8_ Salesforce Cloud | Q26_B_Part_9_ Alibaba Cloud | Q26_B_Part_10_ Tencent Cloud | Q26_B_Part_11_None | Q26_B_OTHER_Other | Q27_B_Part_1_ Amazon EC2 | Q27_B_Part_2_ AWS Lambda | Q27_B_Part_3_ Amazon Elastic Container Service | Q27_B_Part_4_ Azure Cloud Services | Q27_B_Part_5_ Microsoft Azure Container Instances | Q27_B_Part_6_ Azure Functions | Q27_B_Part_7_ Google Cloud Compute Engine | Q27_B_Part_8_ Google Cloud Functions | Q27_B_Part_9_ Google Cloud Run | Q27_B_Part_10_ Google Cloud App Engine | Q27_B_Part_11_None | Q27_B_OTHER_Other | Q28_B_Part_1_ Amazon SageMaker | Q28_B_Part_2_ Amazon Forecast | Q28_B_Part_3_ Amazon Rekognition | Q28_B_Part_4_ Azure Machine Learning Studio | Q28_B_Part_5_ Azure Cognitive Services | Q28_B_Part_6_ Google Cloud AI Platform / Google Cloud ML Engine | Q28_B_Part_7_ Google Cloud Video AI | Q28_B_Part_8_ Google Cloud Natural Language | Q28_B_Part_9_ Google Cloud Vision AI | Q28_B_Part_10_None | Q28_B_OTHER_Other | Q29_B_Part_1_MySQL | Q29_B_Part_2_PostgresSQL | Q29_B_Part_3_SQLite | Q29_B_Part_4_Oracle Database | 
Q29_B_Part_5_MongoDB | Q29_B_Part_6_Snowflake | Q29_B_Part_7_IBM Db2 | Q29_B_Part_8_Microsoft SQL Server | Q29_B_Part_9_Microsoft Access | Q29_B_Part_10_Microsoft Azure Data Lake Storage | Q29_B_Part_11_Amazon Redshift | Q29_B_Part_12_Amazon Athena | Q29_B_Part_13_Amazon DynamoDB | Q29_B_Part_14_Google Cloud BigQuery | Q29_B_Part_15_Google Cloud SQL | Q29_B_Part_16_Google Cloud Firestore | Q29_B_Part_17_None | Q29_B_OTHER_Other | Q31_B_Part_1_Microsoft Power BI | Q31_B_Part_2_Amazon QuickSight | Q31_B_Part_3_Google Data Studio | Q31_B_Part_4_Looker | Q31_B_Part_5_Tableau | Q31_B_Part_6_Salesforce | Q31_B_Part_7_Einstein Analytics | Q31_B_Part_8_Qlik | Q31_B_Part_9_Domo | Q31_B_Part_10_TIBCO Spotfire | Q31_B_Part_11_Alteryx | Q31_B_Part_12_Sisense | Q31_B_Part_13_SAP Analytics Cloud | Q31_B_Part_14_None | Q31_B_OTHER_Other | Q33_B_Part_1_Automated data augmentation (e.g. imgaug, albumentations) | Q33_B_Part_2_Automated feature engineering/selection (e.g. tpot, boruta_py) | Q33_B_Part_3_Automated model selection (e.g. auto-sklearn, xcessiv) | Q33_B_Part_4_Automated model architecture searches (e.g. darts, enas) | Q33_B_Part_5_Automated hyperparameter tuning (e.g. hyperopt, ray.tune, Vizier) | Q33_B_Part_6_Automation of full ML pipelines (e.g. 
Google Cloud AutoML, H20 Driverless AI) | Q33_B_Part_7_None | Q33_B_OTHER_Other | Q34_B_Part_1_ Google Cloud AutoML | Q34_B_Part_2_ H20 Driverless AI | Q34_B_Part_3_ Databricks AutoML | Q34_B_Part_4_ DataRobot AutoML | Q34_B_Part_5_ Tpot | Q34_B_Part_6_ Auto-Keras | Q34_B_Part_7_ Auto-Sklearn | Q34_B_Part_8_ Auto_ml | Q34_B_Part_9_ Xcessiv | Q34_B_Part_10_ MLbox | Q34_B_Part_11_None | Q34_B_OTHER_Other | Q35_B_Part_1_ Neptune.ai | Q35_B_Part_2_ Weights & Biases | Q35_B_Part_3_ Comet.ml | Q35_B_Part_4_ Sacred + Omniboard | Q35_B_Part_5_ TensorBoard | Q35_B_Part_6_ Guild.ai | Q35_B_Part_7_ Polyaxon | Q35_B_Part_8_ Trains | Q35_B_Part_9_ Domino Model Monitor | Q35_B_Part_10_None | Q35_B_OTHER_Other | salary_cleaned_0-999 | salary_cleaned_1000-1999 | salary_cleaned_10000-14999 | salary_cleaned_100000-124999 | salary_cleaned_125000-149999 | salary_cleaned_15000-19999 | salary_cleaned_150000-199999 | salary_cleaned_2000-2999 | salary_cleaned_20000-24999 | salary_cleaned_200000-249999 | salary_cleaned_25000-29999 | salary_cleaned_250000-299999 | salary_cleaned_3000-3999 | salary_cleaned_30000-39999 | salary_cleaned_300000-500000 | salary_cleaned_4000-4999 | salary_cleaned_40000-49999 | salary_cleaned_5000-7499 | salary_cleaned_50000-59999 | salary_cleaned_500000 | salary_cleaned_60000-69999 | salary_cleaned_70000-79999 | salary_cleaned_7500-9999 | salary_cleaned_80000-89999 | salary_cleaned_90000-99999 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
|---|---|
| 2 | 100000 | 124999 | 112499.5 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... 
| 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 15000 | 19999 | 17499.5 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 125000 | 149999 | 137499.5 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | ... 
| 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 9 | 70000 | 79999 | 74999.5 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | ... 
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 12 | 30000 | 39999 | 34999.5 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 575 columns
I did not use the `drop_first` argument of `pd.get_dummies` because of the questions that are split into multiple columns. Each of those columns holds only one distinct value (apart from nulls), so `drop_first` would drop its only dummy column altogether. Instead, I manually drop one reference category per question each time I select specific columns for a model.
# Response variable: the approximate (bracket midpoint) salary
Y = model_dummies['aprox_salary']
# The data was trimmed to Men & Women, so a single gender dummy is enough
# (Man acts as the reference group). statsmodels does not add an intercept
# on its own, so a constant column is added alongside the dummy.
X = sm.add_constant(model_dummies['Q2_Woman'])
# set up the ordinary least squares regression and fit it to the data
model = sm.OLS(Y, X)
results = model.fit()
# summary report (watch video to see interpretation)
results.summary()
| Dep. Variable: | aprox_salary | R-squared: | 0.006 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.006 |
| Method: | Least Squares | F-statistic: | 59.46 |
| Date: | Wed, 27 Jul 2022 | Prob (F-statistic): | 1.36e-14 |
| Time: | 15:54:53 | Log-Likelihood: | -1.3189e+05 |
| No. Observations: | 10555 | AIC: | 2.638e+05 |
| Df Residuals: | 10553 | BIC: | 2.638e+05 |
| Df Model: | 1 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| const | 4.63e+04 | 686.389 | 67.452 | 0.000 | 4.5e+04 | 4.76e+04 |
| Q2_Woman | -1.326e+04 | 1718.927 | -7.711 | 0.000 | -1.66e+04 | -9885.702 |
| Omnibus: | 7193.154 | Durbin-Watson: | 1.978 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 111688.022 |
| Skew: | 3.122 | Prob(JB): | 0.00 |
| Kurtosis: | 17.662 | Cond. No. | 2.81 |
# create function to add additional questions to dataframe for easier processing
def qnums(question_list, dataframe):
    '''
    Select the dummy columns that belong to the given survey questions.

    A column belongs to a question when the text before its first
    underscore equals the question id, e.g. 'Q7_Part_1_Python' belongs to
    'Q7' while 'Q20_0-49 employees' does not belong to 'Q2'.

    Args:
        question_list: question ids (e.g. ['Q2', 'Q4']) whose dummy columns
            are wanted. A single id given as a plain string is treated as
            a one-element list.
        dataframe: the dataframe that contains dummy variables for all
            columns.

    Returns:
        A dataframe containing the dummy columns for the selected
        questions, grouped in the order of question_list and, within each
        question, in the column order of dataframe.
    '''
    # A bare string would otherwise be iterated character by character and
    # silently match nothing.
    if isinstance(question_list, str):
        question_list = [question_list]

    # Compute each column's question prefix once instead of once per
    # question; str() keeps non-string column labels from raising.
    prefixes = [(col, str(col).split('_')[0]) for col in dataframe.columns]

    q_out = [
        col
        for question in question_list
        for col, prefix in prefixes
        if prefix == question
    ]
    return dataframe.loc[:, q_out]
# Collect the dummy columns for questions 2, 4 and 5 and preview them
q245 = qnums(question_list=['Q2', 'Q4', 'Q5'], dataframe=model_dummies)
q245.head(5)
| Q2_Man | Q2_Woman | Q4_Bachelor’s degree | Q4_Doctoral degree | Q4_I prefer not to answer | Q4_Master’s degree | Q4_No formal education past high school | Q4_Professional degree | Q4_Some college/university study without earning a bachelor’s degree | Q5_Business Analyst | Q5_DBA/Database Engineer | Q5_Data Analyst | Q5_Data Engineer | Q5_Data Scientist | Q5_Machine Learning Engineer | Q5_Other | Q5_Product/Project Manager | Q5_Research Scientist | Q5_Software Engineer | Q5_Statistician | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 9 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 12 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
# Each question's dummies are redundant as a full set, so remove one
# column per question before adding the intercept.
X = sm.add_constant(
    q245.drop(columns=['Q2_Man', 'Q4_I prefer not to answer', 'Q5_Other'])
)
# Fit the model with gender, education and role as features
model = sm.OLS(endog=Y, exog=X)
results = model.fit()
results.summary()
| Dep. Variable: | aprox_salary | R-squared: | 0.050 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.048 |
| Method: | Least Squares | F-statistic: | 32.36 |
| Date: | Wed, 27 Jul 2022 | Prob (F-statistic): | 3.50e-103 |
| Time: | 15:54:55 | Log-Likelihood: | -1.3165e+05 |
| No. Observations: | 10555 | AIC: | 2.633e+05 |
| Df Residuals: | 10537 | BIC: | 2.635e+05 |
| Df Model: | 17 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| const | 2.689e+04 | 5480.525 | 4.906 | 0.000 | 1.61e+04 | 3.76e+04 |
| Q2_Woman | -1.292e+04 | 1692.183 | -7.634 | 0.000 | -1.62e+04 | -9601.676 |
| Q4_Bachelor’s degree | 1.088e+04 | 5385.253 | 2.020 | 0.043 | 324.352 | 2.14e+04 |
| Q4_Doctoral degree | 4.388e+04 | 5530.348 | 7.935 | 0.000 | 3.3e+04 | 5.47e+04 |
| Q4_Master’s degree | 2.513e+04 | 5338.990 | 4.707 | 0.000 | 1.47e+04 | 3.56e+04 |
| Q4_No formal education past high school | 6736.3251 | 8115.798 | 0.830 | 0.407 | -9172.175 | 2.26e+04 |
| Q4_Professional degree | 1.653e+04 | 6025.209 | 2.744 | 0.006 | 4721.945 | 2.83e+04 |
| Q4_Some college/university study without earning a bachelor’s degree | 1.521e+04 | 6189.580 | 2.457 | 0.014 | 3073.799 | 2.73e+04 |
| Q5_Business Analyst | -5416.6046 | 2953.257 | -1.834 | 0.067 | -1.12e+04 | 372.338 |
| Q5_DBA/Database Engineer | 411.9450 | 6365.478 | 0.065 | 0.948 | -1.21e+04 | 1.29e+04 |
| Q5_Data Analyst | -1.509e+04 | 2436.229 | -6.192 | 0.000 | -1.99e+04 | -1.03e+04 |
| Q5_Data Engineer | 2381.2133 | 3697.997 | 0.644 | 0.520 | -4867.561 | 9629.987 |
| Q5_Data Scientist | 4727.5139 | 2102.637 | 2.248 | 0.025 | 605.948 | 8849.080 |
| Q5_Machine Learning Engineer | -9499.1658 | 2671.681 | -3.556 | 0.000 | -1.47e+04 | -4262.165 |
| Q5_Product/Project Manager | 1.477e+04 | 3102.542 | 4.761 | 0.000 | 8689.266 | 2.09e+04 |
| Q5_Research Scientist | -1.423e+04 | 2729.248 | -5.212 | 0.000 | -1.96e+04 | -8876.061 |
| Q5_Software Engineer | -5539.5680 | 2292.779 | -2.416 | 0.016 | -1e+04 | -1045.287 |
| Q5_Statistician | -1.415e+04 | 4364.134 | -3.243 | 0.001 | -2.27e+04 | -5596.983 |
| Omnibus: | 7260.254 | Durbin-Watson: | 1.980 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 119737.467 |
| Skew: | 3.138 | Prob(JB): | 0.00 |
| Kurtosis: | 18.260 | Cond. No. | 28.2 |
# Extend the feature set with question 7 (programming languages)
# on top of questions 2, 4 and 5
q2457 = qnums(question_list=['Q2', 'Q4', 'Q5', 'Q7'], dataframe=model_dummies)
q2457.head(5)
| Q2_Man | Q2_Woman | Q4_Bachelor’s degree | Q4_Doctoral degree | Q4_I prefer not to answer | Q4_Master’s degree | Q4_No formal education past high school | Q4_Professional degree | Q4_Some college/university study without earning a bachelor’s degree | Q5_Business Analyst | Q5_DBA/Database Engineer | Q5_Data Analyst | Q5_Data Engineer | Q5_Data Scientist | Q5_Machine Learning Engineer | Q5_Other | Q5_Product/Project Manager | Q5_Research Scientist | Q5_Software Engineer | Q5_Statistician | Q7_Part_1_Python | Q7_Part_2_R | Q7_Part_3_SQL | Q7_Part_4_C | Q7_Part_5_C++ | Q7_Part_6_Java | Q7_Part_7_Javascript | Q7_Part_8_Julia | Q7_Part_9_Swift | Q7_Part_10_Bash | Q7_Part_11_MATLAB | Q7_Part_12_None | Q7_OTHER_Other | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 4 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 9 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 12 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
# Remove one column per question, add the intercept, then refit
X = sm.add_constant(
    q2457.drop(
        columns=[
            'Q2_Man',
            'Q4_I prefer not to answer',
            'Q5_Other',
            'Q7_OTHER_Other',
        ]
    )
)
model = sm.OLS(endog=Y, exog=X)
results = model.fit()
results.summary()
| Dep. Variable: | aprox_salary | R-squared: | 0.081 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.078 |
| Method: | Least Squares | F-statistic: | 31.92 |
| Date: | Wed, 27 Jul 2022 | Prob (F-statistic): | 3.27e-168 |
| Time: | 15:54:56 | Log-Likelihood: | -1.3148e+05 |
| No. Observations: | 10555 | AIC: | 2.630e+05 |
| Df Residuals: | 10525 | BIC: | 2.632e+05 |
| Df Model: | 29 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| const | 2.648e+04 | 5515.996 | 4.801 | 0.000 | 1.57e+04 | 3.73e+04 |
| Q2_Woman | -1.136e+04 | 1678.207 | -6.768 | 0.000 | -1.46e+04 | -8068.294 |
| Q4_Bachelor’s degree | 9112.9305 | 5306.501 | 1.717 | 0.086 | -1288.817 | 1.95e+04 |
| Q4_Doctoral degree | 4.116e+04 | 5457.030 | 7.543 | 0.000 | 3.05e+04 | 5.19e+04 |
| Q4_Master’s degree | 2.194e+04 | 5264.659 | 4.168 | 0.000 | 1.16e+04 | 3.23e+04 |
| Q4_No formal education past high school | 4480.3109 | 7992.155 | 0.561 | 0.575 | -1.12e+04 | 2.01e+04 |
| Q4_Professional degree | 1.409e+04 | 5935.740 | 2.374 | 0.018 | 2453.310 | 2.57e+04 |
| Q4_Some college/university study without earning a bachelor’s degree | 1.356e+04 | 6094.636 | 2.226 | 0.026 | 1617.591 | 2.55e+04 |
| Q5_Business Analyst | -7794.0458 | 2924.883 | -2.665 | 0.008 | -1.35e+04 | -2060.721 |
| Q5_DBA/Database Engineer | -9304.2854 | 6313.022 | -1.474 | 0.141 | -2.17e+04 | 3070.434 |
| Q5_Data Analyst | -1.809e+04 | 2434.246 | -7.430 | 0.000 | -2.29e+04 | -1.33e+04 |
| Q5_Data Engineer | -4449.6035 | 3689.488 | -1.206 | 0.228 | -1.17e+04 | 2782.492 |
| Q5_Data Scientist | -402.2331 | 2127.984 | -0.189 | 0.850 | -4573.485 | 3769.019 |
| Q5_Machine Learning Engineer | -8624.4866 | 2674.788 | -3.224 | 0.001 | -1.39e+04 | -3381.396 |
| Q5_Product/Project Manager | 1.434e+04 | 3060.436 | 4.686 | 0.000 | 8340.668 | 2.03e+04 |
| Q5_Research Scientist | -1.288e+04 | 2715.954 | -4.744 | 0.000 | -1.82e+04 | -7559.700 |
| Q5_Software Engineer | -6526.1595 | 2377.008 | -2.746 | 0.006 | -1.12e+04 | -1866.774 |
| Q5_Statistician | -1.72e+04 | 4388.765 | -3.920 | 0.000 | -2.58e+04 | -8601.639 |
| Q7_Part_1_Python | -2089.8217 | 1672.978 | -1.249 | 0.212 | -5369.175 | 1189.532 |
| Q7_Part_2_R | 6332.3808 | 1487.724 | 4.256 | 0.000 | 3416.160 | 9248.602 |
| Q7_Part_3_SQL | 9799.2102 | 1340.259 | 7.311 | 0.000 | 7172.049 | 1.24e+04 |
| Q7_Part_4_C | -5592.6104 | 2049.608 | -2.729 | 0.006 | -9610.231 | -1574.990 |
| Q7_Part_5_C++ | -3761.5088 | 1908.684 | -1.971 | 0.049 | -7502.891 | -20.126 |
| Q7_Part_6_Java | -1154.8178 | 1814.272 | -0.637 | 0.524 | -4711.135 | 2401.500 |
| Q7_Part_7_Javascript | -4954.9170 | 1784.160 | -2.777 | 0.005 | -8452.209 | -1457.625 |
| Q7_Part_8_Julia | 1.924e+04 | 4870.333 | 3.950 | 0.000 | 9692.458 | 2.88e+04 |
| Q7_Part_9_Swift | 1.497e+04 | 5645.352 | 2.652 | 0.008 | 3903.772 | 2.6e+04 |
| Q7_Part_10_Bash | 2.451e+04 | 1897.962 | 12.913 | 0.000 | 2.08e+04 | 2.82e+04 |
| Q7_Part_11_MATLAB | -9398.2617 | 2095.059 | -4.486 | 0.000 | -1.35e+04 | -5291.548 |
| Q7_Part_12_None | 4859.3494 | 6388.702 | 0.761 | 0.447 | -7663.717 | 1.74e+04 |
| Omnibus: | 7389.009 | Durbin-Watson: | 1.989 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 130840.255 |
| Skew: | 3.190 | Prob(JB): | 0.00 |
| Kurtosis: | 19.025 | Cond. No. | 37.8 |
# Questions 2, 4, 5 and 7 plus question 3 (country); adding country gives
# a huge boost in model performance
q24573 = qnums(
    question_list=['Q2', 'Q4', 'Q5', 'Q7', 'Q3'],
    dataframe=model_dummies,
)
q24573.head(5)
| Q2_Man | Q2_Woman | Q4_Bachelor’s degree | Q4_Doctoral degree | Q4_I prefer not to answer | Q4_Master’s degree | Q4_No formal education past high school | Q4_Professional degree | Q4_Some college/university study without earning a bachelor’s degree | Q5_Business Analyst | Q5_DBA/Database Engineer | Q5_Data Analyst | Q5_Data Engineer | Q5_Data Scientist | Q5_Machine Learning Engineer | Q5_Other | Q5_Product/Project Manager | Q5_Research Scientist | Q5_Software Engineer | Q5_Statistician | Q7_Part_1_Python | Q7_Part_2_R | Q7_Part_3_SQL | Q7_Part_4_C | Q7_Part_5_C++ | Q7_Part_6_Java | Q7_Part_7_Javascript | Q7_Part_8_Julia | Q7_Part_9_Swift | Q7_Part_10_Bash | Q7_Part_11_MATLAB | Q7_Part_12_None | Q7_OTHER_Other | Q3_Argentina | Q3_Australia | Q3_Bangladesh | Q3_Belarus | Q3_Belgium | Q3_Brazil | Q3_Canada | Q3_Chile | Q3_China | Q3_Colombia | Q3_Egypt | Q3_France | Q3_Germany | Q3_Ghana | Q3_Greece | Q3_India | Q3_Indonesia | Q3_Iran, Islamic Republic of... | Q3_Ireland | Q3_Israel | Q3_Italy | Q3_Japan | Q3_Kenya | Q3_Malaysia | Q3_Mexico | Q3_Morocco | Q3_Nepal | Q3_Netherlands | Q3_Nigeria | Q3_Other | Q3_Pakistan | Q3_Peru | Q3_Philippines | Q3_Poland | Q3_Portugal | Q3_Republic of Korea | Q3_Romania | Q3_Russia | Q3_Saudi Arabia | Q3_Singapore | Q3_South Africa | Q3_South Korea | Q3_Spain | Q3_Sri Lanka | Q3_Sweden | Q3_Switzerland | Q3_Taiwan | Q3_Thailand | Q3_Tunisia | Q3_Turkey | Q3_Ukraine | Q3_United Arab Emirates | Q3_United Kingdom of Great Britain and Northern Ireland | Q3_United States of America | Q3_Viet Nam | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 3 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 9 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 12 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
# Remove one column per question (including the 'Other' country), add the
# intercept, then refit
X = sm.add_constant(
    q24573.drop(
        columns=[
            'Q2_Man',
            'Q4_I prefer not to answer',
            'Q5_Other',
            'Q7_OTHER_Other',
            'Q3_Other',
        ]
    )
)
model = sm.OLS(endog=Y, exog=X)
results = model.fit()
results.summary()
| Dep. Variable: | aprox_salary | R-squared: | 0.353 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.348 |
| Method: | Least Squares | F-statistic: | 68.92 |
| Date: | Wed, 27 Jul 2022 | Prob (F-statistic): | 0.00 |
| Time: | 15:54:58 | Log-Likelihood: | -1.2962e+05 |
| No. Observations: | 10555 | AIC: | 2.594e+05 |
| Df Residuals: | 10471 | BIC: | 2.600e+05 |
| Df Model: | 83 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| const | 2.303e+04 | 4971.296 | 4.633 | 0.000 | 1.33e+04 | 3.28e+04 |
| Q2_Woman | -1.346e+04 | 1423.798 | -9.453 | 0.000 | -1.63e+04 | -1.07e+04 |
| Q4_Bachelor’s degree | 1877.8259 | 4491.395 | 0.418 | 0.676 | -6926.165 | 1.07e+04 |
| Q4_Doctoral degree | 1.935e+04 | 4631.697 | 4.177 | 0.000 | 1.03e+04 | 2.84e+04 |
| Q4_Master’s degree | 8351.0922 | 4456.400 | 1.874 | 0.061 | -384.300 | 1.71e+04 |
| Q4_No formal education past high school | -2951.3930 | 6750.021 | -0.437 | 0.662 | -1.62e+04 | 1.03e+04 |
| Q4_Professional degree | 1.28e+04 | 5025.954 | 2.547 | 0.011 | 2950.481 | 2.27e+04 |
| Q4_Some college/university study without earning a bachelor’s degree | -1827.7110 | 5152.296 | -0.355 | 0.723 | -1.19e+04 | 8271.771 |
| Q5_Business Analyst | -1749.9121 | 2469.141 | -0.709 | 0.479 | -6589.898 | 3090.074 |
| Q5_DBA/Database Engineer | -1641.0576 | 5322.343 | -0.308 | 0.758 | -1.21e+04 | 8791.749 |
| Q5_Data Analyst | -1.166e+04 | 2057.328 | -5.668 | 0.000 | -1.57e+04 | -7628.561 |
| Q5_Data Engineer | -403.6505 | 3115.749 | -0.130 | 0.897 | -6511.112 | 5703.811 |
| Q5_Data Scientist | 4365.7392 | 1796.150 | 2.431 | 0.015 | 844.943 | 7886.535 |
| Q5_Machine Learning Engineer | 389.8339 | 2270.354 | 0.172 | 0.864 | -4060.493 | 4840.161 |
| Q5_Product/Project Manager | 1.405e+04 | 2585.103 | 5.437 | 0.000 | 8986.708 | 1.91e+04 |
| Q5_Research Scientist | -7161.9120 | 2297.496 | -3.117 | 0.002 | -1.17e+04 | -2658.381 |
| Q5_Software Engineer | -977.1483 | 2009.662 | -0.486 | 0.627 | -4916.469 | 2962.173 |
| Q5_Statistician | -6398.9408 | 3711.459 | -1.724 | 0.085 | -1.37e+04 | 876.226 |
| Q7_Part_1_Python | -186.4436 | 1418.933 | -0.131 | 0.895 | -2967.824 | 2594.936 |
| Q7_Part_2_R | 714.0109 | 1266.882 | 0.564 | 0.573 | -1769.318 | 3197.340 |
| Q7_Part_3_SQL | 4678.6395 | 1138.072 | 4.111 | 0.000 | 2447.801 | 6909.478 |
| Q7_Part_4_C | 1908.0340 | 1747.381 | 1.092 | 0.275 | -1517.165 | 5333.233 |
| Q7_Part_5_C++ | -1074.1109 | 1612.061 | -0.666 | 0.505 | -4234.058 | 2085.836 |
| Q7_Part_6_Java | 2649.4801 | 1538.619 | 1.722 | 0.085 | -366.505 | 5665.466 |
| Q7_Part_7_Javascript | -3155.3535 | 1512.138 | -2.087 | 0.037 | -6119.431 | -191.276 |
| Q7_Part_8_Julia | 1.117e+04 | 4112.684 | 2.715 | 0.007 | 3105.609 | 1.92e+04 |
| Q7_Part_9_Swift | 6519.2254 | 4762.432 | 1.369 | 0.171 | -2816.049 | 1.59e+04 |
| Q7_Part_10_Bash | 1.18e+04 | 1618.805 | 7.290 | 0.000 | 8627.637 | 1.5e+04 |
| Q7_Part_11_MATLAB | -5487.3901 | 1778.945 | -3.085 | 0.002 | -8974.461 | -2000.319 |
| Q7_Part_12_None | -1.224e+04 | 5398.276 | -2.267 | 0.023 | -2.28e+04 | -1654.161 |
| Q3_Argentina | -1.377e+04 | 6123.882 | -2.249 | 0.025 | -2.58e+04 | -1769.132 |
| Q3_Australia | 5.545e+04 | 4825.082 | 11.491 | 0.000 | 4.6e+04 | 6.49e+04 |
| Q3_Bangladesh | -2.429e+04 | 7956.870 | -3.052 | 0.002 | -3.99e+04 | -8691.286 |
| Q3_Belarus | -1.617e+04 | 8824.264 | -1.832 | 0.067 | -3.35e+04 | 1127.606 |
| Q3_Belgium | 2.498e+04 | 9055.882 | 2.758 | 0.006 | 7225.690 | 4.27e+04 |
| Q3_Brazil | -9097.0594 | 3120.326 | -2.915 | 0.004 | -1.52e+04 | -2980.625 |
| Q3_Canada | 4.142e+04 | 4188.497 | 9.890 | 0.000 | 3.32e+04 | 4.96e+04 |
| Q3_Chile | -7469.8282 | 6927.236 | -1.078 | 0.281 | -2.1e+04 | 6108.874 |
| Q3_China | 4801.5007 | 4539.215 | 1.058 | 0.290 | -4096.227 | 1.37e+04 |
| Q3_Colombia | -1.771e+04 | 5144.552 | -3.442 | 0.001 | -2.78e+04 | -7623.734 |
| Q3_Egypt | -1.774e+04 | 5811.043 | -3.052 | 0.002 | -2.91e+04 | -6346.460 |
| Q3_France | 1.136e+04 | 4224.719 | 2.688 | 0.007 | 3075.901 | 1.96e+04 |
| Q3_Germany | 3.797e+04 | 3823.698 | 9.930 | 0.000 | 3.05e+04 | 4.55e+04 |
| Q3_Ghana | 3025.6666 | 1.16e+04 | 0.261 | 0.794 | -1.97e+04 | 2.57e+04 |
| Q3_Greece | -8652.4807 | 6765.173 | -1.279 | 0.201 | -2.19e+04 | 4608.548 |
| Q3_India | -1.085e+04 | 2196.452 | -4.942 | 0.000 | -1.52e+04 | -6549.450 |
| Q3_Indonesia | -9925.2258 | 5120.181 | -1.938 | 0.053 | -2e+04 | 111.305 |
| Q3_Iran, Islamic Republic of... | -2.356e+04 | 6281.990 | -3.751 | 0.000 | -3.59e+04 | -1.12e+04 |
| Q3_Ireland | 2.449e+04 | 9185.606 | 2.667 | 0.008 | 6488.006 | 4.25e+04 |
| Q3_Israel | 7.696e+04 | 6912.318 | 11.134 | 0.000 | 6.34e+04 | 9.05e+04 |
| Q3_Italy | 5438.5370 | 4313.546 | 1.261 | 0.207 | -3016.836 | 1.39e+04 |
| Q3_Japan | 1.511e+04 | 3292.577 | 4.589 | 0.000 | 8655.411 | 2.16e+04 |
| Q3_Kenya | -1.918e+04 | 6486.710 | -2.956 | 0.003 | -3.19e+04 | -6462.313 |
| Q3_Malaysia | -4936.4603 | 7379.266 | -0.669 | 0.504 | -1.94e+04 | 9528.307 |
| Q3_Mexico | -8056.6910 | 4863.886 | -1.656 | 0.098 | -1.76e+04 | 1477.453 |
| Q3_Morocco | -2.376e+04 | 7058.710 | -3.366 | 0.001 | -3.76e+04 | -9919.708 |
| Q3_Nepal | -1.523e+04 | 1.19e+04 | -1.284 | 0.199 | -3.85e+04 | 8028.379 |
| Q3_Netherlands | 3.717e+04 | 5382.807 | 6.905 | 0.000 | 2.66e+04 | 4.77e+04 |
| Q3_Nigeria | -1.805e+04 | 3866.361 | -4.669 | 0.000 | -2.56e+04 | -1.05e+04 |
| Q3_Pakistan | -1.83e+04 | 5132.118 | -3.567 | 0.000 | -2.84e+04 | -8244.491 |
| Q3_Peru | -1.501e+04 | 6919.618 | -2.168 | 0.030 | -2.86e+04 | -1441.296 |
| Q3_Philippines | -1.091e+04 | 7728.146 | -1.411 | 0.158 | -2.61e+04 | 4241.768 |
| Q3_Poland | -1720.5819 | 5669.763 | -0.303 | 0.762 | -1.28e+04 | 9393.234 |
| Q3_Portugal | -1203.7407 | 5961.484 | -0.202 | 0.840 | -1.29e+04 | 1.05e+04 |
| Q3_Republic of Korea | 3295.7740 | 8311.136 | 0.397 | 0.692 | -1.3e+04 | 1.96e+04 |
| Q3_Romania | -8390.3897 | 8713.273 | -0.963 | 0.336 | -2.55e+04 | 8689.285 |
| Q3_Russia | -1.126e+04 | 3397.957 | -3.314 | 0.001 | -1.79e+04 | -4600.425 |
| Q3_Saudi Arabia | 3493.1809 | 7872.797 | 0.444 | 0.657 | -1.19e+04 | 1.89e+04 |
| Q3_Singapore | 2.755e+04 | 5928.575 | 4.647 | 0.000 | 1.59e+04 | 3.92e+04 |
| Q3_South Africa | 8659.0332 | 6189.325 | 1.399 | 0.162 | -3473.224 | 2.08e+04 |
| Q3_South Korea | 8805.0625 | 5936.789 | 1.483 | 0.138 | -2832.176 | 2.04e+04 |
| Q3_Spain | 1.065e+04 | 3917.096 | 2.720 | 0.007 | 2975.713 | 1.83e+04 |
| Q3_Sri Lanka | -1.717e+04 | 8716.519 | -1.970 | 0.049 | -3.43e+04 | -83.953 |
| Q3_Sweden | 2.569e+04 | 7371.012 | 3.485 | 0.000 | 1.12e+04 | 4.01e+04 |
| Q3_Switzerland | 8.518e+04 | 7804.316 | 10.914 | 0.000 | 6.99e+04 | 1e+05 |
| Q3_Taiwan | -7931.8019 | 5122.522 | -1.548 | 0.122 | -1.8e+04 | 2109.317 |
| Q3_Thailand | 1716.8745 | 6470.016 | 0.265 | 0.791 | -1.1e+04 | 1.44e+04 |
| Q3_Tunisia | -1.022e+04 | 7672.124 | -1.332 | 0.183 | -2.53e+04 | 4816.895 |
| Q3_Turkey | -1.581e+04 | 4528.834 | -3.491 | 0.000 | -2.47e+04 | -6933.438 |
| Q3_Ukraine | -1.218e+04 | 5297.113 | -2.299 | 0.022 | -2.26e+04 | -1792.397 |
| Q3_United Arab Emirates | 3.101e+04 | 8031.475 | 3.861 | 0.000 | 1.53e+04 | 4.68e+04 |
| Q3_United Kingdom of Great Britain and Northern Ireland | 4.806e+04 | 3401.737 | 14.128 | 0.000 | 4.14e+04 | 5.47e+04 |
| Q3_United States of America | 8.678e+04 | 2331.742 | 37.217 | 0.000 | 8.22e+04 | 9.14e+04 |
| Q3_Viet Nam | -1.713e+04 | 6065.842 | -2.823 | 0.005 | -2.9e+04 | -5236.170 |
| Omnibus: | 8936.415 | Durbin-Watson: | 1.969 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 362278.541 |
| Skew: | 3.879 | Prob(JB): | 0.00 |
| Kurtosis: | 30.632 | Cond. No. | 49.6 |
# Full feature set: questions 2, 3, 4, 5, 6, 7 and 20
q245736 = qnums(
    question_list=['Q2', 'Q4', 'Q5', 'Q7', 'Q3', 'Q6', 'Q20'],
    dataframe=model_dummies,
)
# Remove one column per question before adding the intercept
X = sm.add_constant(
    q245736.drop(
        columns=[
            'Q2_Man',
            'Q4_I prefer not to answer',
            'Q5_Other',
            'Q7_OTHER_Other',
            'Q3_Other',
            'Q6_I have never written code',
            'Q20_50-249 employees',
        ]
    )
)
# Kept under its own name (model2) because the regularized fit reuses it
model2 = sm.OLS(endog=Y, exog=X)
results = model2.fit()
results.summary()
| Dep. Variable: | aprox_salary | R-squared: | 0.391 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.386 |
| Method: | Least Squares | F-statistic: | 72.37 |
| Date: | Wed, 27 Jul 2022 | Prob (F-statistic): | 0.00 |
| Time: | 15:54:58 | Log-Likelihood: | -1.2930e+05 |
| No. Observations: | 10555 | AIC: | 2.588e+05 |
| Df Residuals: | 10461 | BIC: | 2.595e+05 |
| Df Model: | 93 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| const | 1.998e+04 | 5227.841 | 3.821 | 0.000 | 9729.629 | 3.02e+04 |
| Q2_Woman | -9850.1672 | 1391.441 | -7.079 | 0.000 | -1.26e+04 | -7122.677 |
| Q4_Bachelor’s degree | 2358.8884 | 4362.365 | 0.541 | 0.589 | -6192.180 | 1.09e+04 |
| Q4_Doctoral degree | 1.066e+04 | 4514.067 | 2.362 | 0.018 | 1813.450 | 1.95e+04 |
| Q4_Master’s degree | 6061.2281 | 4330.350 | 1.400 | 0.162 | -2427.085 | 1.45e+04 |
| Q4_No formal education past high school | -4032.2904 | 6552.793 | -0.615 | 0.538 | -1.69e+04 | 8812.434 |
| Q4_Professional degree | 9471.6892 | 4882.542 | 1.940 | 0.052 | -99.024 | 1.9e+04 |
| Q4_Some college/university study without earning a bachelor’s degree | 379.7929 | 5003.560 | 0.076 | 0.939 | -9428.139 | 1.02e+04 |
| Q5_Business Analyst | -221.6800 | 2399.179 | -0.092 | 0.926 | -4924.530 | 4481.169 |
| Q5_DBA/Database Engineer | -7440.0709 | 5178.243 | -1.437 | 0.151 | -1.76e+04 | 2710.273 |
| Q5_Data Analyst | -8205.4418 | 2008.429 | -4.086 | 0.000 | -1.21e+04 | -4268.538 |
| Q5_Data Engineer | -3306.4966 | 3039.060 | -1.088 | 0.277 | -9263.635 | 2650.641 |
| Q5_Data Scientist | 4891.9299 | 1769.479 | 2.765 | 0.006 | 1423.414 | 8360.446 |
| Q5_Machine Learning Engineer | 2304.1423 | 2233.703 | 1.032 | 0.302 | -2074.341 | 6682.626 |
| Q5_Product/Project Manager | 1.132e+04 | 2514.215 | 4.503 | 0.000 | 6392.083 | 1.62e+04 |
| Q5_Research Scientist | -7513.2542 | 2247.037 | -3.344 | 0.001 | -1.19e+04 | -3108.632 |
| Q5_Software Engineer | -5280.5330 | 1995.056 | -2.647 | 0.008 | -9191.223 | -1369.843 |
| Q5_Statistician | -5492.2495 | 3612.777 | -1.520 | 0.128 | -1.26e+04 | 1589.482 |
| Q7_Part_1_Python | 311.0264 | 1574.945 | 0.197 | 0.843 | -2776.167 | 3398.220 |
| Q7_Part_2_R | -694.3749 | 1246.093 | -0.557 | 0.577 | -3136.954 | 1748.205 |
| Q7_Part_3_SQL | 1141.8799 | 1125.737 | 1.014 | 0.310 | -1064.778 | 3348.538 |
| Q7_Part_4_C | 13.6995 | 1703.929 | 0.008 | 0.994 | -3326.326 | 3353.725 |
| Q7_Part_5_C++ | -2100.2435 | 1569.196 | -1.338 | 0.181 | -5176.168 | 975.681 |
| Q7_Part_6_Java | 141.7489 | 1498.198 | 0.095 | 0.925 | -2795.005 | 3078.503 |
| Q7_Part_7_Javascript | -3366.5836 | 1479.874 | -2.275 | 0.023 | -6267.419 | -465.748 |
| Q7_Part_8_Julia | 9157.2964 | 3995.353 | 2.292 | 0.022 | 1325.642 | 1.7e+04 |
| Q7_Part_9_Swift | 5386.1976 | 4628.395 | 1.164 | 0.245 | -3686.340 | 1.45e+04 |
| Q7_Part_10_Bash | 7736.1222 | 1583.277 | 4.886 | 0.000 | 4632.596 | 1.08e+04 |
| Q7_Part_11_MATLAB | -5133.0776 | 1729.888 | -2.967 | 0.003 | -8523.988 | -1742.167 |
| Q7_Part_12_None | -9453.0678 | 5348.503 | -1.767 | 0.077 | -1.99e+04 | 1031.019 |
| Q3_Argentina | -1.859e+04 | 5948.935 | -3.124 | 0.002 | -3.02e+04 | -6926.095 |
| Q3_Australia | 4.951e+04 | 4692.555 | 10.550 | 0.000 | 4.03e+04 | 5.87e+04 |
| Q3_Bangladesh | -1.723e+04 | 7730.119 | -2.228 | 0.026 | -3.24e+04 | -2073.911 |
| Q3_Belarus | -1.584e+04 | 8565.637 | -1.849 | 0.064 | -3.26e+04 | 951.447 |
| Q3_Belgium | 1.693e+04 | 8797.272 | 1.924 | 0.054 | -316.344 | 3.42e+04 |
| Q3_Brazil | -1.339e+04 | 3036.307 | -4.409 | 0.000 | -1.93e+04 | -7436.254 |
| Q3_Canada | 3.663e+04 | 4070.898 | 8.999 | 0.000 | 2.87e+04 | 4.46e+04 |
| Q3_Chile | -8582.8955 | 6724.484 | -1.276 | 0.202 | -2.18e+04 | 4598.377 |
| Q3_China | 7905.5053 | 4412.585 | 1.792 | 0.073 | -744.003 | 1.66e+04 |
| Q3_Colombia | -1.863e+04 | 4996.331 | -3.728 | 0.000 | -2.84e+04 | -8832.320 |
| Q3_Egypt | -1.428e+04 | 5644.172 | -2.530 | 0.011 | -2.53e+04 | -3215.022 |
| Q3_France | 8280.5304 | 4107.988 | 2.016 | 0.044 | 228.091 | 1.63e+04 |
| Q3_Germany | 3.208e+04 | 3719.491 | 8.625 | 0.000 | 2.48e+04 | 3.94e+04 |
| Q3_Ghana | 9811.4420 | 1.13e+04 | 0.872 | 0.383 | -1.22e+04 | 3.19e+04 |
| Q3_Greece | -8373.2555 | 6569.016 | -1.275 | 0.202 | -2.12e+04 | 4503.269 |
| Q3_India | -1.112e+04 | 2156.875 | -5.155 | 0.000 | -1.53e+04 | -6891.507 |
| Q3_Indonesia | -4494.9637 | 4977.574 | -0.903 | 0.367 | -1.43e+04 | 5262.031 |
| Q3_Iran, Islamic Republic of... | -2.148e+04 | 6102.855 | -3.519 | 0.000 | -3.34e+04 | -9514.652 |
| Q3_Ireland | 2.055e+04 | 8920.917 | 2.303 | 0.021 | 3060.145 | 3.8e+04 |
| Q3_Israel | 7.095e+04 | 6718.985 | 10.560 | 0.000 | 5.78e+04 | 8.41e+04 |
| Q3_Italy | -1169.9848 | 4196.801 | -0.279 | 0.780 | -9396.515 | 7056.545 |
| Q3_Japan | 7281.8949 | 3218.303 | 2.263 | 0.024 | 973.408 | 1.36e+04 |
| Q3_Kenya | -1.281e+04 | 6303.713 | -2.032 | 0.042 | -2.52e+04 | -450.434 |
| Q3_Malaysia | -6959.8146 | 7166.815 | -0.971 | 0.332 | -2.1e+04 | 7088.509 |
| Q3_Mexico | -1.091e+04 | 4724.046 | -2.310 | 0.021 | -2.02e+04 | -1650.827 |
| Q3_Morocco | -1.404e+04 | 6866.604 | -2.044 | 0.041 | -2.75e+04 | -578.718 |
| Q3_Nepal | -6962.2559 | 1.15e+04 | -0.604 | 0.546 | -2.96e+04 | 1.56e+04 |
| Q3_Netherlands | 2.961e+04 | 5236.019 | 5.656 | 0.000 | 1.94e+04 | 3.99e+04 |
| Q3_Nigeria | -1.177e+04 | 3769.755 | -3.122 | 0.002 | -1.92e+04 | -4380.472 |
| Q3_Pakistan | -1.097e+04 | 4993.100 | -2.197 | 0.028 | -2.08e+04 | -1181.114 |
| Q3_Peru | -1.325e+04 | 6718.581 | -1.972 | 0.049 | -2.64e+04 | -81.368 |
| Q3_Philippines | -1.135e+04 | 7505.656 | -1.512 | 0.131 | -2.61e+04 | 3365.160 |
| Q3_Poland | -6248.3355 | 5509.457 | -1.134 | 0.257 | -1.7e+04 | 4551.251 |
| Q3_Portugal | -5687.7887 | 5796.105 | -0.981 | 0.326 | -1.7e+04 | 5673.683 |
| Q3_Republic of Korea | 5056.5521 | 8071.553 | 0.626 | 0.531 | -1.08e+04 | 2.09e+04 |
| Q3_Romania | -1.229e+04 | 8461.673 | -1.453 | 0.146 | -2.89e+04 | 4292.657 |
| Q3_Russia | -1.246e+04 | 3301.880 | -3.773 | 0.000 | -1.89e+04 | -5985.725 |
| Q3_Saudi Arabia | 2011.4624 | 7649.045 | 0.263 | 0.793 | -1.3e+04 | 1.7e+04 |
| Q3_Singapore | 2.16e+04 | 5765.514 | 3.746 | 0.000 | 1.03e+04 | 3.29e+04 |
| Q3_South Africa | 5698.3482 | 6009.788 | 0.948 | 0.343 | -6081.984 | 1.75e+04 |
| Q3_South Korea | 9149.8168 | 5762.790 | 1.588 | 0.112 | -2146.351 | 2.04e+04 |
| Q3_Spain | 5185.5609 | 3810.059 | 1.361 | 0.174 | -2282.882 | 1.27e+04 |
| Q3_Sri Lanka | -1.202e+04 | 8465.007 | -1.420 | 0.156 | -2.86e+04 | 4571.707 |
| Q3_Sweden | 1.636e+04 | 7165.799 | 2.283 | 0.022 | 2315.915 | 3.04e+04 |
| Q3_Switzerland | 7.905e+04 | 7581.736 | 10.427 | 0.000 | 6.42e+04 | 9.39e+04 |
| Q3_Taiwan | -8175.4263 | 4972.654 | -1.644 | 0.100 | -1.79e+04 | 1571.925 |
| Q3_Thailand | -192.3400 | 6283.489 | -0.031 | 0.976 | -1.25e+04 | 1.21e+04 |
| Q3_Tunisia | -3590.7725 | 7454.417 | -0.482 | 0.630 | -1.82e+04 | 1.1e+04 |
| Q3_Turkey | -1.412e+04 | 4397.217 | -3.212 | 0.001 | -2.27e+04 | -5504.602 |
| Q3_Ukraine | -1.084e+04 | 5143.358 | -2.107 | 0.035 | -2.09e+04 | -753.795 |
| Q3_United Arab Emirates | 3.132e+04 | 7801.773 | 4.014 | 0.000 | 1.6e+04 | 4.66e+04 |
| Q3_United Kingdom of Great Britain and Northern Ireland | 4.06e+04 | 3318.105 | 12.237 | 0.000 | 3.41e+04 | 4.71e+04 |
| Q3_United States of America | 7.818e+04 | 2296.443 | 34.045 | 0.000 | 7.37e+04 | 8.27e+04 |
| Q3_Viet Nam | -1.328e+04 | 5891.838 | -2.253 | 0.024 | -2.48e+04 | -1727.901 |
| Q6_1-2 years | -991.9096 | 2921.728 | -0.339 | 0.734 | -6719.053 | 4735.234 |
| Q6_10-20 years | 2.508e+04 | 3067.494 | 8.175 | 0.000 | 1.91e+04 | 3.11e+04 |
| Q6_20+ years | 3.311e+04 | 3186.038 | 10.392 | 0.000 | 2.69e+04 | 3.94e+04 |
| Q6_3-5 years | 1630.9287 | 2900.235 | 0.562 | 0.574 | -4054.084 | 7315.942 |
| Q6_5-10 years | 1.24e+04 | 2974.756 | 4.169 | 0.000 | 6569.521 | 1.82e+04 |
| Q6_< 1 years | -952.9977 | 3000.153 | -0.318 | 0.751 | -6833.870 | 4927.874 |
| Q20_0-49 employees | -7222.0868 | 1546.324 | -4.670 | 0.000 | -1.03e+04 | -4190.997 |
| Q20_10,000 or more employees | 1.171e+04 | 1740.130 | 6.727 | 0.000 | 8294.723 | 1.51e+04 |
| Q20_1000-9,999 employees | 5977.2049 | 1766.838 | 3.383 | 0.001 | 2513.865 | 9440.545 |
| Q20_250-999 employees | 3313.6578 | 1930.550 | 1.716 | 0.086 | -470.589 | 7097.905 |
| Omnibus: | 9292.806 | Durbin-Watson: | 1.977 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 433887.014 |
| Skew: | 4.076 | Prob(JB): | 0.00 |
| Kurtosis: | 33.334 | Cond. No. | 52.9 |
# Refit model2 with a lasso penalty. L1_wt=1 means Lasso Regression and
# L1_wt=0 means ridge regression. An alpha above zero removes variables that
# are not as important, and a higher alpha cuts off more of them; an alpha of
# zero is basically a plain Linear Regression. alpha is set high enough here
# to eliminate some variables.
results_reg = model2.fit_regularized(alpha=5, L1_wt=1)
# Wrap the regularized coefficients in an OLSResults object so that a
# summary table can be printed for them.
# NOTE(review): the covariance passed here is the one stored on model2 by
# its earlier unpenalized fit, so the std err / t / P>|t| columns of this
# summary presumably do not account for the lasso penalty - treat with care.
final = sm.regression.linear_model.OLSResults(
    model2,
    results_reg.params,
    normalized_cov_params=model2.normalized_cov_params,
)
print(final.summary())
OLS Regression Results
==============================================================================
Dep. Variable: aprox_salary R-squared: 0.391
Model: OLS Adj. R-squared: 0.385
Method: Least Squares F-statistic: 72.13
Date: Wed, 27 Jul 2022 Prob (F-statistic): 0.00
Time: 15:55:03 Log-Likelihood: -1.2931e+05
No. Observations: 10555 AIC: 2.588e+05
Df Residuals: 10461 BIC: 2.595e+05
Df Model: 93
Covariance Type: nonrobust
========================================================================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------------------------------------------------------
const 2.728e+04 5231.137 5.214 0.000 1.7e+04 3.75e+04
Q2_Woman -9926.8810 1392.319 -7.130 0.000 -1.27e+04 -7197.671
Q4_Bachelor’s degree 0 4365.116 0 1.000 -8556.460 8556.460
Q4_Doctoral degree 8611.7882 4516.913 1.907 0.057 -242.223 1.75e+04
Q4_Master’s degree 3941.9186 4333.081 0.910 0.363 -4551.746 1.24e+04
Q4_No formal education past high school -5660.8662 6556.924 -0.863 0.388 -1.85e+04 7191.957
Q4_Professional degree 7058.9360 4885.620 1.445 0.149 -2517.812 1.66e+04
Q4_Some college/university study without earning a bachelor’s degree -1644.8231 5006.714 -0.329 0.743 -1.15e+04 8169.292
Q5_Business Analyst 0 2400.692 0 1.000 -4705.815 4705.815
Q5_DBA/Database Engineer -6391.6307 5181.507 -1.234 0.217 -1.65e+04 3765.112
Q5_Data Analyst -7802.4291 2009.695 -3.882 0.000 -1.17e+04 -3863.043
Q5_Data Engineer -2694.7072 3040.976 -0.886 0.376 -8655.601 3266.187
Q5_Data Scientist 5237.0525 1770.594 2.958 0.003 1766.349 8707.756
Q5_Machine Learning Engineer 2832.5290 2235.111 1.267 0.205 -1548.715 7213.773
Q5_Product/Project Manager 1.183e+04 2515.800 4.701 0.000 6895.020 1.68e+04
Q5_Research Scientist -6960.5172 2248.454 -3.096 0.002 -1.14e+04 -2553.118
Q5_Software Engineer -4540.9729 1996.314 -2.275 0.023 -8454.129 -627.817
Q5_Statistician -4928.0490 3615.055 -1.363 0.173 -1.2e+04 2158.148
Q7_Part_1_Python 1979.7206 1575.938 1.256 0.209 -1109.419 5068.861
Q7_Part_2_R 0 1246.878 0 1.000 -2444.120 2444.120
Q7_Part_3_SQL 1323.3089 1126.446 1.175 0.240 -884.741 3531.359
Q7_Part_4_C 218.4105 1705.003 0.128 0.898 -3123.721 3560.542
Q7_Part_5_C++ -1988.5873 1570.186 -1.266 0.205 -5066.451 1089.276
Q7_Part_6_Java 170.9445 1499.143 0.114 0.909 -2767.661 3109.550
Q7_Part_7_Javascript -3373.5353 1480.807 -2.278 0.023 -6276.200 -470.871
Q7_Part_8_Julia 8813.2997 3997.872 2.204 0.028 976.708 1.66e+04
Q7_Part_9_Swift 5223.8990 4631.314 1.128 0.259 -3854.359 1.43e+04
Q7_Part_10_Bash 7706.9081 1584.276 4.865 0.000 4601.425 1.08e+04
Q7_Part_11_MATLAB -5103.2229 1730.979 -2.948 0.003 -8496.271 -1710.174
Q7_Part_12_None -7145.0805 5351.876 -1.335 0.182 -1.76e+04 3345.617
Q3_Argentina -2.049e+04 5952.685 -3.442 0.001 -3.22e+04 -8822.552
Q3_Australia 4.677e+04 4695.513 9.960 0.000 3.76e+04 5.6e+04
Q3_Bangladesh -1.848e+04 7734.993 -2.389 0.017 -3.36e+04 -3318.267
Q3_Belarus -1.693e+04 8571.038 -1.976 0.048 -3.37e+04 -133.945
Q3_Belgium 1.3e+04 8802.819 1.477 0.140 -4254.470 3.03e+04
Q3_Brazil -1.579e+04 3038.222 -5.199 0.000 -2.18e+04 -9839.196
Q3_Canada 3.381e+04 4073.465 8.301 0.000 2.58e+04 4.18e+04
Q3_Chile -1.013e+04 6728.724 -1.506 0.132 -2.33e+04 3057.320
Q3_China 0 4415.367 0 1.000 -8654.962 8654.962
Q3_Colombia -2.085e+04 4999.481 -4.170 0.000 -3.06e+04 -1.1e+04
Q3_Egypt -1.605e+04 5647.730 -2.842 0.004 -2.71e+04 -4982.774
Q3_France 0 4110.578 0 1.000 -8057.517 8057.517
Q3_Germany 2.943e+04 3721.836 7.908 0.000 2.21e+04 3.67e+04
Q3_Ghana 0 1.13e+04 0 1.000 -2.21e+04 2.21e+04
Q3_Greece -1.008e+04 6573.158 -1.534 0.125 -2.3e+04 2800.364
Q3_India -1.364e+04 2158.235 -6.320 0.000 -1.79e+04 -9408.762
Q3_Indonesia -6613.1198 4980.712 -1.328 0.184 -1.64e+04 3150.026
Q3_Iran, Islamic Republic of... -2.322e+04 6106.703 -3.803 0.000 -3.52e+04 -1.13e+04
Q3_Ireland 1.629e+04 8926.541 1.824 0.068 -1211.571 3.38e+04
Q3_Israel 6.778e+04 6723.221 10.082 0.000 5.46e+04 8.1e+04
Q3_Italy -3291.3737 4199.447 -0.784 0.433 -1.15e+04 4940.343
Q3_Japan 4627.1024 3220.332 1.437 0.151 -1685.362 1.09e+04
Q3_Kenya -1.462e+04 6307.687 -2.318 0.020 -2.7e+04 -2259.632
Q3_Malaysia -8250.6329 7171.333 -1.151 0.250 -2.23e+04 5806.548
Q3_Mexico -1.304e+04 4727.024 -2.760 0.006 -2.23e+04 -3778.765
Q3_Morocco -1.599e+04 6870.934 -2.328 0.020 -2.95e+04 -2523.777
Q3_Nepal -6973.7885 1.15e+04 -0.605 0.545 -2.96e+04 1.56e+04
Q3_Netherlands 2.665e+04 5239.321 5.087 0.000 1.64e+04 3.69e+04
Q3_Nigeria -1.4e+04 3772.132 -3.711 0.000 -2.14e+04 -6605.740
Q3_Pakistan -1.3e+04 4996.248 -2.602 0.009 -2.28e+04 -3204.645
Q3_Peru -1.491e+04 6722.817 -2.218 0.027 -2.81e+04 -1731.599
Q3_Philippines -1.29e+04 7510.388 -1.718 0.086 -2.76e+04 1818.448
Q3_Poland -8300.2996 5512.930 -1.506 0.132 -1.91e+04 2506.095
Q3_Portugal -7485.7523 5799.759 -1.291 0.197 -1.89e+04 3882.883
Q3_Republic of Korea 0 8076.642 0 1.000 -1.58e+04 1.58e+04
Q3_Romania -1.346e+04 8467.008 -1.589 0.112 -3.01e+04 3141.280
Q3_Russia -1.483e+04 3303.962 -4.487 0.000 -2.13e+04 -8349.045
Q3_Saudi Arabia 0 7653.867 0 1.000 -1.5e+04 1.5e+04
Q3_Singapore 1.855e+04 5769.149 3.215 0.001 7236.425 2.99e+04
Q3_South Africa 0 6013.578 0 1.000 -1.18e+04 1.18e+04
Q3_South Korea 0 5766.424 0 1.000 -1.13e+04 1.13e+04
Q3_Spain 0 3812.461 0 1.000 -7473.152 7473.152
Q3_Sri Lanka -1.325e+04 8470.344 -1.564 0.118 -2.99e+04 3351.680
Q3_Sweden 1.31e+04 7170.317 1.827 0.068 -952.709 2.72e+04
Q3_Switzerland 7.549e+04 7586.516 9.950 0.000 6.06e+04 9.04e+04
Q3_Taiwan -1.04e+04 4975.790 -2.091 0.037 -2.02e+04 -650.887
Q3_Thailand -2104.3487 6287.450 -0.335 0.738 -1.44e+04 1.02e+04
Q3_Tunisia -5128.2497 7459.117 -0.688 0.492 -1.97e+04 9493.043
Q3_Turkey -1.626e+04 4399.990 -3.696 0.000 -2.49e+04 -7636.442
Q3_Ukraine -1.277e+04 5146.601 -2.481 0.013 -2.29e+04 -2679.326
Q3_United Arab Emirates 2.749e+04 7806.692 3.522 0.000 1.22e+04 4.28e+04
Q3_United Kingdom of Great Britain and Northern Ireland 3.812e+04 3320.197 11.482 0.000 3.16e+04 4.46e+04
Q3_United States of America 7.577e+04 2297.890 32.974 0.000 7.13e+04 8.03e+04
Q3_Viet Nam -1.548e+04 5895.553 -2.625 0.009 -2.7e+04 -3920.557
Q6_1-2 years -4748.7300 2923.570 -1.624 0.104 -1.05e+04 982.024
Q6_10-20 years 2.119e+04 3069.428 6.904 0.000 1.52e+04 2.72e+04
Q6_20+ years 2.923e+04 3188.047 9.168 0.000 2.3e+04 3.55e+04
Q6_3-5 years -2181.5607 2902.063 -0.752 0.452 -7870.158 3507.037
Q6_5-10 years 8524.6558 2976.632 2.864 0.004 2689.890 1.44e+04
Q6_< 1 years -4605.4944 3002.044 -1.534 0.125 -1.05e+04 1279.086
Q20_0-49 employees -8368.2556 1547.299 -5.408 0.000 -1.14e+04 -5335.255
Q20_10,000 or more employees 1.074e+04 1741.227 6.170 0.000 7330.148 1.42e+04
Q20_1000-9,999 employees 4940.7687 1767.952 2.795 0.005 1475.245 8406.292
Q20_250-999 employees 2276.2734 1931.768 1.178 0.239 -1510.360 6062.906
==============================================================================
Omnibus: 9307.711 Durbin-Watson: 1.978
Prob(Omnibus): 0.000 Jarque-Bera (JB): 436263.322
Skew: 4.085 Prob(JB): 0.00
Kurtosis: 33.418 Cond. No. 52.9
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
from sklearn.ensemble import RandomForestRegressor
# Compare against random forest feature importance (allows us to rank features).
# A fixed random_state makes the fitted forest, and therefore the importance
# ranking plotted afterwards, reproducible from run to run.
clf_rf = RandomForestRegressor(random_state=42)
clf_rf.fit(X, Y)
RandomForestRegressor()
# Label each random-forest importance score with its feature (column) name.
feat_importances = pd.Series(clf_rf.feature_importances_, index=X.columns)

# Plot the 25 most important features, sorted ascending so that the most
# important one ends up at the top of the horizontal bar chart.
top_importances = feat_importances.nlargest(25).sort_values()
ax = top_importances.plot(kind='barh', figsize=(6, 12))

# Highlight the gender feature in red. Its bar position is looked up in the
# plotted series rather than hard-coded, because the ranking can change
# whenever the forest is refit. If it is not among the top 25 there is no
# bar to highlight, so nothing is drawn.
if 'Q2_Woman' in top_importances.index:
    woman_pos = list(top_importances.index).index('Q2_Woman')
    ax.barh([woman_pos], feat_importances.loc['Q2_Woman'], color='red')
<BarContainer object of 1 artists>
Next, we build models that do not include the survey participant's gender as a feature. This serves as an extra check: we want to see whether a model still predicts higher salaries for men even though it was never told that the data it was trained on came from men.
# Split the data by gender so that a separate salary model can be built for
# women and for men and the two can be compared on the same data. This is
# also a decent way to isolate the individual effects of education, country, etc.
# NOTE(review): men are taken as the rows with Q2_Woman == 0; this presumably
# relies on model_dummies holding only Man/Woman respondents -- confirm.
woman_flag = model_dummies['Q2_Woman']
Women_Model = model_dummies.loc[woman_flag == 1]
Men_Model = model_dummies.loc[woman_flag == 0]

# Build the target and predictors for the women's model.
women_fin = qnums(['Q4', 'Q5', 'Q7', 'Q3', 'Q6', 'Q20'], Women_Model)
Y_W = Women_Model['aprox_salary']
# One dummy column per question is left out of the predictors
# (presumably to act as the reference category -- confirm).
baseline_cols = [
    'Q4_I prefer not to answer',
    'Q5_Other',
    'Q7_OTHER_Other',
    'Q3_Other',
    'Q6_I have never written code',
    'Q20_50-249 employees',
]
# Add the intercept column expected by the statsmodels OLS fit.
X_W = sm.add_constant(women_fin.drop(columns=baseline_cols))
Women_Model.head()
| salary_min | salary_max | aprox_salary | Q1_18-21 | Q1_22-24 | Q1_25-29 | Q1_30-34 | Q1_35-39 | Q1_40-44 | Q1_45-49 | Q1_50-54 | Q1_55-59 | Q1_60-69 | Q1_70+ | Q2_Man | Q2_Woman | Q3_Argentina | Q3_Australia | Q3_Bangladesh | Q3_Belarus | Q3_Belgium | Q3_Brazil | Q3_Canada | Q3_Chile | Q3_China | Q3_Colombia | Q3_Egypt | Q3_France | Q3_Germany | Q3_Ghana | Q3_Greece | Q3_India | Q3_Indonesia | Q3_Iran, Islamic Republic of... | Q3_Ireland | Q3_Israel | Q3_Italy | Q3_Japan | Q3_Kenya | Q3_Malaysia | Q3_Mexico | Q3_Morocco | Q3_Nepal | Q3_Netherlands | Q3_Nigeria | Q3_Other | Q3_Pakistan | Q3_Peru | Q3_Philippines | Q3_Poland | Q3_Portugal | Q3_Republic of Korea | Q3_Romania | Q3_Russia | Q3_Saudi Arabia | Q3_Singapore | Q3_South Africa | Q3_South Korea | Q3_Spain | Q3_Sri Lanka | Q3_Sweden | Q3_Switzerland | Q3_Taiwan | Q3_Thailand | Q3_Tunisia | Q3_Turkey | Q3_Ukraine | Q3_United Arab Emirates | Q3_United Kingdom of Great Britain and Northern Ireland | Q3_United States of America | Q3_Viet Nam | Q4_Bachelor’s degree | Q4_Doctoral degree | Q4_I prefer not to answer | Q4_Master’s degree | Q4_No formal education past high school | Q4_Professional degree | Q4_Some college/university study without earning a bachelor’s degree | Q5_Business Analyst | Q5_DBA/Database Engineer | Q5_Data Analyst | Q5_Data Engineer | Q5_Data Scientist | Q5_Machine Learning Engineer | Q5_Other | Q5_Product/Project Manager | Q5_Research Scientist | Q5_Software Engineer | Q5_Statistician | Q6_1-2 years | Q6_10-20 years | Q6_20+ years | Q6_3-5 years | Q6_5-10 years | Q6_< 1 years | Q6_I have never written code | Q7_Part_1_Python | Q7_Part_2_R | Q7_Part_3_SQL | Q7_Part_4_C | Q7_Part_5_C++ | Q7_Part_6_Java | Q7_Part_7_Javascript | Q7_Part_8_Julia | Q7_Part_9_Swift | Q7_Part_10_Bash | Q7_Part_11_MATLAB | Q7_Part_12_None | Q7_OTHER_Other | Q8_Bash | Q8_C | Q8_C++ | Q8_Java | Q8_Javascript | Q8_Julia | Q8_MATLAB | Q8_None | Q8_Other | Q8_Python | Q8_R | Q8_SQL | Q8_Swift | Q9_Part_1_Jupyter 
(JupyterLab, Jupyter Notebooks, etc) | Q9_Part_2_ RStudio | Q9_Part_3_Visual Studio | Q9_Part_4_Visual Studio Code (VSCode) | Q9_Part_5_ PyCharm | Q9_Part_6_ Spyder | Q9_Part_7_ Notepad++ | Q9_Part_8_ Sublime Text | Q9_Part_9_ Vim / Emacs | Q9_Part_10_ MATLAB | Q9_Part_11_None | Q9_OTHER_Other | Q10_Part_1_ Kaggle Notebooks | Q10_Part_2_Colab Notebooks | Q10_Part_3_Azure Notebooks | Q10_Part_4_ Paperspace / Gradient | Q10_Part_5_ Binder / JupyterHub | Q10_Part_6_ Code Ocean | Q10_Part_7_ IBM Watson Studio | Q10_Part_8_ Amazon Sagemaker Studio | Q10_Part_9_ Amazon EMR Notebooks | Q10_Part_10_Google Cloud AI Platform Notebooks | Q10_Part_11_Google Cloud Datalab Notebooks | Q10_Part_12_ Databricks Collaborative Notebooks | Q10_Part_13_None | Q10_OTHER_Other | Q11_A cloud computing platform (AWS, Azure, GCP, hosted notebooks, etc) | Q11_A deep learning workstation (NVIDIA GTX, LambdaLabs, etc) | Q11_A personal computer or laptop | Q11_None | Q11_Other | Q12_Part_1_GPUs | Q12_Part_2_TPUs | Q12_Part_3_None | Q12_OTHER_Other | Q13_2-5 times | Q13_6-25 times | Q13_More than 25 times | Q13_Never | Q13_Once | Q14_Part_1_ Matplotlib | Q14_Part_2_ Seaborn | Q14_Part_3_ Plotly / Plotly Express | Q14_Part_4_ Ggplot / ggplot2 | Q14_Part_5_ Shiny | Q14_Part_6_ D3 js | Q14_Part_7_ Altair | Q14_Part_8_ Bokeh | Q14_Part_9_ Geoplotlib | Q14_Part_10_ Leaflet / Folium | Q14_Part_11_None | Q14_OTHER_Other | Q15_1-2 years | Q15_10-20 years | Q15_2-3 years | Q15_20 or more years | Q15_3-4 years | Q15_4-5 years | Q15_5-10 years | Q15_I do not use machine learning methods | Q15_Under 1 year | Q16_Part_1_ Scikit-learn | Q16_Part_2_ TensorFlow | Q16_Part_3_ Keras | Q16_Part_4_ PyTorch | Q16_Part_5_ Fast.ai | Q16_Part_6_ MXNet | Q16_Part_7_ Xgboost | Q16_Part_8_ LightGBM | Q16_Part_9_ CatBoost | Q16_Part_10_ Prophet | Q16_Part_11_ H2O 3 | Q16_Part_12_ Caret | Q16_Part_13_ Tidymodels | Q16_Part_14_ JAX | Q16_Part_15_None | Q16_OTHER_Other | Q17_Part_1_Linear or Logistic Regression | 
Q17_Part_2_Decision Trees or Random Forests | Q17_Part_3_Gradient Boosting Machines (xgboost, lightgbm, etc) | Q17_Part_4_Bayesian Approaches | Q17_Part_5_Evolutionary Approaches | Q17_Part_6_Dense Neural Networks (MLPs, etc) | Q17_Part_7_Convolutional Neural Networks | Q17_Part_8_Generative Adversarial Networks | Q17_Part_9_Recurrent Neural Networks | Q17_Part_10_Transformer Networks (BERT, gpt-3, etc) | Q17_Part_11_None | Q17_OTHER_Other | Q18_Part_1_General purpose image/video tools (PIL, cv2, skimage, etc) | Q18_Part_2_Image segmentation methods (U-Net, Mask R-CNN, etc) | Q18_Part_3_Object detection methods (YOLOv3, RetinaNet, etc) | Q18_Part_4_Image classification and other general purpose networks (VGG, Inception, ResNet, ResNeXt, NASNet, EfficientNet, etc) | Q18_Part_5_Generative Networks (GAN, VAE, etc) | Q18_Part_6_None | Q18_OTHER_Other | Q19_Part_1_Word embeddings/vectors (GLoVe, fastText, word2vec) | Q19_Part_2_Encoder-decorder models (seq2seq, vanilla transformers) | Q19_Part_3_Contextualized embeddings (ELMo, CoVe) | Q19_Part_4_Transformer language models (GPT-3, BERT, XLnet, etc) | Q19_Part_5_None | Q19_OTHER_Other | Q20_0-49 employees | Q20_10,000 or more employees | Q20_1000-9,999 employees | Q20_250-999 employees | Q20_50-249 employees | Q21_0 | Q21_1-2 | Q21_10-14 | Q21_15-19 | Q21_20+ | Q21_3-4 | Q21_5-9 | Q22_I do not know | Q22_No (we do not use ML methods) | Q22_We are exploring ML methods (and may one day put a model into production) | Q22_We have well established ML methods (i.e., models in production for more than 2 years) | Q22_We recently started using ML methods (i.e., models in production for less than 2 years) | Q22_We use ML methods for generating insights (but do not put working models into production) | Q23_Part_1_Analyze and understand data to influence product or business decisions | Q23_Part_2_Build and/or run the data infrastructure that my business uses for storing, analyzing, and operationalizing data | Q23_Part_3_Build 
prototypes to explore applying machine learning to new areas | Q23_Part_4_Build and/or run a machine learning service that operationally improves my product or workflows | Q23_Part_5_Experimentation and iteration to improve existing ML models | Q23_Part_6_Do research that advances the state of the art of machine learning | Q23_Part_7_None of these activities are an important part of my role at work | Q23_OTHER_Other | ... | Q29_A_Part_10_Microsoft Azure Data Lake Storage | Q29_A_Part_11_Amazon Redshift | Q29_A_Part_12_Amazon Athena | Q29_A_Part_13_Amazon DynamoDB | Q29_A_Part_14_Google Cloud BigQuery | Q29_A_Part_15_Google Cloud SQL | Q29_A_Part_16_Google Cloud Firestore | Q29_A_Part_17_None | Q29_A_OTHER_Other | Q30_Amazon Athena | Q30_Amazon DynamoDB | Q30_Amazon Redshift | Q30_Google Cloud BigQuery | Q30_Google Cloud Firestore | Q30_Google Cloud SQL | Q30_IBM Db2 | Q30_Microsoft Access | Q30_Microsoft Azure Data Lake Storage | Q30_Microsoft SQL Server | Q30_MongoDB | Q30_MySQL | Q30_Oracle Database | Q30_Other | Q30_PostgresSQL | Q30_SQLite | Q30_Snowflake | Q31_A_Part_1_Amazon QuickSight | Q31_A_Part_2_Microsoft Power BI | Q31_A_Part_3_Google Data Studio | Q31_A_Part_4_Looker | Q31_A_Part_5_Tableau | Q31_A_Part_6_Salesforce | Q31_A_Part_7_Einstein Analytics | Q31_A_Part_8_Qlik | Q31_A_Part_9_Domo | Q31_A_Part_10_TIBCO Spotfire | Q31_A_Part_11_Alteryx | Q31_A_Part_12_Sisense | Q31_A_Part_13_SAP Analytics Cloud | Q31_A_Part_14_None | Q31_A_OTHER_Other | Q32_Alteryx | Q32_Amazon QuickSight | Q32_Domo | Q32_Einstein Analytics | Q32_Google Data Studio | Q32_Looker | Q32_Microsoft Power BI | Q32_Other | Q32_Qlik | Q32_SAP Analytics Cloud | Q32_Salesforce | Q32_Sisense | Q32_TIBCO Spotfire | Q32_Tableau | Q33_A_Part_1_Automated data augmentation (e.g. imgaug, albumentations) | Q33_A_Part_2_Automated feature engineering/selection (e.g. tpot, boruta_py) | Q33_A_Part_3_Automated model selection (e.g. 
auto-sklearn, xcessiv) | Q33_A_Part_4_Automated model architecture searches (e.g. darts, enas) | Q33_A_Part_5_Automated hyperparameter tuning (e.g. hyperopt, ray.tune, Vizier) | Q33_A_Part_6_Automation of full ML pipelines (e.g. Google AutoML, H20 Driverless AI) | Q33_A_Part_7_No / None | Q33_A_OTHER_Other | Q34_A_Part_1_ Google Cloud AutoML | Q34_A_Part_2_ H20 Driverless AI | Q34_A_Part_3_ Databricks AutoML | Q34_A_Part_4_ DataRobot AutoML | Q34_A_Part_5_ Tpot | Q34_A_Part_6_ Auto-Keras | Q34_A_Part_7_ Auto-Sklearn | Q34_A_Part_8_ Auto_ml | Q34_A_Part_9_ Xcessiv | Q34_A_Part_10_ MLbox | Q34_A_Part_11_No / None | Q34_A_OTHER_Other | Q35_A_Part_1_ Neptune.ai | Q35_A_Part_2_ Weights & Biases | Q35_A_Part_3_ Comet.ml | Q35_A_Part_4_ Sacred + Omniboard | Q35_A_Part_5_ TensorBoard | Q35_A_Part_6_ Guild.ai | Q35_A_Part_7_ Polyaxon | Q35_A_Part_8_ Trains | Q35_A_Part_9_ Domino Model Monitor | Q35_A_Part_10_No / None | Q35_A_OTHER_Other | Q36_Part_1_ Plotly Dash | Q36_Part_2_ Streamlit | Q36_Part_3_ NBViewer | Q36_Part_4_ GitHub | Q36_Part_5_ Personal blog | Q36_Part_6_ Kaggle | Q36_Part_7_ Colab | Q36_Part_8_ Shiny | Q36_Part_9_I do not share my work publicly | Q36_OTHER_Other | Q37_Part_1_Coursera | Q37_Part_2_edX | Q37_Part_3_Kaggle Learn Courses | Q37_Part_4_DataCamp | Q37_Part_5_Fast.ai | Q37_Part_6_Udacity | Q37_Part_7_Udemy | Q37_Part_8_LinkedIn Learning | Q37_Part_9_Cloud-certification programs (direct from AWS, Azure, GCP, or similar) | Q37_Part_10_University Courses (resulting in a university degree) | Q37_Part_11_None | Q37_OTHER_Other | Q38_Advanced statistical software (SPSS, SAS, etc.) | Q38_Basic statistical software (Microsoft Excel, Google Sheets, etc.) | Q38_Business intelligence software (Salesforce, Tableau, Spotfire, etc.) | Q38_Cloud-based data software & APIs (AWS, GCP, Azure, etc.) | Q38_Local development environments (RStudio, JupyterLab, etc.) 
| Q38_Other | Q39_Part_1_Twitter (data science influencers) | Q39_Part_2_Email newsletters (Data Elixir, O'Reilly Data & AI, etc) | Q39_Part_3_Reddit (r/machinelearning, etc) | Q39_Part_4_Kaggle (notebooks, forums, etc) | Q39_Part_5_Course Forums (forums.fast.ai, Coursera forums, etc) | Q39_Part_6_YouTube (Kaggle YouTube, Cloud AI Adventures, etc) | Q39_Part_7_Podcasts (Chai Time Data Science, O’Reilly Data Show, etc) | Q39_Part_8_Blogs (Towards Data Science, Analytics Vidhya, etc) | Q39_Part_9_Journal Publications (peer-reviewed journals, conference proceedings, etc) | Q39_Part_10_Slack Communities (ods.ai, kagglenoobs, etc) | Q39_Part_11_None | Q39_OTHER_Other | Q26_B_Part_1_ Amazon Web Services (AWS) | Q26_B_Part_2_ Microsoft Azure | Q26_B_Part_3_ Google Cloud Platform (GCP) | Q26_B_Part_4_ IBM Cloud / Red Hat | Q26_B_Part_5_ Oracle Cloud | Q26_B_Part_6_ SAP Cloud | Q26_B_Part_7_ VMware Cloud | Q26_B_Part_8_ Salesforce Cloud | Q26_B_Part_9_ Alibaba Cloud | Q26_B_Part_10_ Tencent Cloud | Q26_B_Part_11_None | Q26_B_OTHER_Other | Q27_B_Part_1_ Amazon EC2 | Q27_B_Part_2_ AWS Lambda | Q27_B_Part_3_ Amazon Elastic Container Service | Q27_B_Part_4_ Azure Cloud Services | Q27_B_Part_5_ Microsoft Azure Container Instances | Q27_B_Part_6_ Azure Functions | Q27_B_Part_7_ Google Cloud Compute Engine | Q27_B_Part_8_ Google Cloud Functions | Q27_B_Part_9_ Google Cloud Run | Q27_B_Part_10_ Google Cloud App Engine | Q27_B_Part_11_None | Q27_B_OTHER_Other | Q28_B_Part_1_ Amazon SageMaker | Q28_B_Part_2_ Amazon Forecast | Q28_B_Part_3_ Amazon Rekognition | Q28_B_Part_4_ Azure Machine Learning Studio | Q28_B_Part_5_ Azure Cognitive Services | Q28_B_Part_6_ Google Cloud AI Platform / Google Cloud ML Engine | Q28_B_Part_7_ Google Cloud Video AI | Q28_B_Part_8_ Google Cloud Natural Language | Q28_B_Part_9_ Google Cloud Vision AI | Q28_B_Part_10_None | Q28_B_OTHER_Other | Q29_B_Part_1_MySQL | Q29_B_Part_2_PostgresSQL | Q29_B_Part_3_SQLite | Q29_B_Part_4_Oracle Database | 
Q29_B_Part_5_MongoDB | Q29_B_Part_6_Snowflake | Q29_B_Part_7_IBM Db2 | Q29_B_Part_8_Microsoft SQL Server | Q29_B_Part_9_Microsoft Access | Q29_B_Part_10_Microsoft Azure Data Lake Storage | Q29_B_Part_11_Amazon Redshift | Q29_B_Part_12_Amazon Athena | Q29_B_Part_13_Amazon DynamoDB | Q29_B_Part_14_Google Cloud BigQuery | Q29_B_Part_15_Google Cloud SQL | Q29_B_Part_16_Google Cloud Firestore | Q29_B_Part_17_None | Q29_B_OTHER_Other | Q31_B_Part_1_Microsoft Power BI | Q31_B_Part_2_Amazon QuickSight | Q31_B_Part_3_Google Data Studio | Q31_B_Part_4_Looker | Q31_B_Part_5_Tableau | Q31_B_Part_6_Salesforce | Q31_B_Part_7_Einstein Analytics | Q31_B_Part_8_Qlik | Q31_B_Part_9_Domo | Q31_B_Part_10_TIBCO Spotfire | Q31_B_Part_11_Alteryx | Q31_B_Part_12_Sisense | Q31_B_Part_13_SAP Analytics Cloud | Q31_B_Part_14_None | Q31_B_OTHER_Other | Q33_B_Part_1_Automated data augmentation (e.g. imgaug, albumentations) | Q33_B_Part_2_Automated feature engineering/selection (e.g. tpot, boruta_py) | Q33_B_Part_3_Automated model selection (e.g. auto-sklearn, xcessiv) | Q33_B_Part_4_Automated model architecture searches (e.g. darts, enas) | Q33_B_Part_5_Automated hyperparameter tuning (e.g. hyperopt, ray.tune, Vizier) | Q33_B_Part_6_Automation of full ML pipelines (e.g. 
Google Cloud AutoML, H20 Driverless AI) | Q33_B_Part_7_None | Q33_B_OTHER_Other | Q34_B_Part_1_ Google Cloud AutoML | Q34_B_Part_2_ H20 Driverless AI | Q34_B_Part_3_ Databricks AutoML | Q34_B_Part_4_ DataRobot AutoML | Q34_B_Part_5_ Tpot | Q34_B_Part_6_ Auto-Keras | Q34_B_Part_7_ Auto-Sklearn | Q34_B_Part_8_ Auto_ml | Q34_B_Part_9_ Xcessiv | Q34_B_Part_10_ MLbox | Q34_B_Part_11_None | Q34_B_OTHER_Other | Q35_B_Part_1_ Neptune.ai | Q35_B_Part_2_ Weights & Biases | Q35_B_Part_3_ Comet.ml | Q35_B_Part_4_ Sacred + Omniboard | Q35_B_Part_5_ TensorBoard | Q35_B_Part_6_ Guild.ai | Q35_B_Part_7_ Polyaxon | Q35_B_Part_8_ Trains | Q35_B_Part_9_ Domino Model Monitor | Q35_B_Part_10_None | Q35_B_OTHER_Other | salary_cleaned_0-999 | salary_cleaned_1000-1999 | salary_cleaned_10000-14999 | salary_cleaned_100000-124999 | salary_cleaned_125000-149999 | salary_cleaned_15000-19999 | salary_cleaned_150000-199999 | salary_cleaned_2000-2999 | salary_cleaned_20000-24999 | salary_cleaned_200000-249999 | salary_cleaned_25000-29999 | salary_cleaned_250000-299999 | salary_cleaned_3000-3999 | salary_cleaned_30000-39999 | salary_cleaned_300000-500000 | salary_cleaned_4000-4999 | salary_cleaned_40000-49999 | salary_cleaned_5000-7499 | salary_cleaned_50000-59999 | salary_cleaned_500000 | salary_cleaned_60000-69999 | salary_cleaned_70000-79999 | salary_cleaned_7500-9999 | salary_cleaned_80000-89999 | salary_cleaned_90000-99999 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
|---|---|
| 21 | 1000 | 1999 | 1499.5 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 23 | 125000 | 149999 | 137499.5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... 
| 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 31 | 15000 | 19999 | 17499.5 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | ... 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 1 | 0 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 35 | 125000 | 149999 | 137499.5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 45 | 1000 | 1999 | 1499.5 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 575 columns
# Ordinary least squares regression of the women's approximate salary (Y_W)
# on the women's predictor matrix (X_W, which already contains the constant).
model_W = sm.OLS(endog=Y_W, exog=X_W)
results_W = model_W.fit()
# Show the fitted coefficients and diagnostics for the women's model.
results_W.summary()
| Dep. Variable: | aprox_salary | R-squared: | 0.454 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.422 |
| Method: | Least Squares | F-statistic: | 14.37 |
| Date: | Wed, 27 Jul 2022 | Prob (F-statistic): | 5.16e-150 |
| Time: | 15:55:28 | Log-Likelihood: | -20267. |
| No. Observations: | 1683 | AIC: | 4.072e+04 |
| Df Residuals: | 1590 | BIC: | 4.122e+04 |
| Df Model: | 92 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| const | 8183.1110 | 1.07e+04 | 0.763 | 0.445 | -1.28e+04 | 2.92e+04 |
| Q4_Bachelor’s degree | 3056.2863 | 9321.455 | 0.328 | 0.743 | -1.52e+04 | 2.13e+04 |
| Q4_Doctoral degree | 1.365e+04 | 9532.967 | 1.432 | 0.152 | -5046.439 | 3.24e+04 |
| Q4_Master’s degree | 9813.5973 | 9257.062 | 1.060 | 0.289 | -8343.732 | 2.8e+04 |
| Q4_No formal education past high school | 4502.3068 | 2.37e+04 | 0.190 | 0.849 | -4.2e+04 | 5.1e+04 |
| Q4_Professional degree | 1.096e+04 | 1.04e+04 | 1.051 | 0.294 | -9504.093 | 3.14e+04 |
| Q4_Some college/university study without earning a bachelor’s degree | -2733.4509 | 1.16e+04 | -0.235 | 0.814 | -2.56e+04 | 2.01e+04 |
| Q5_Business Analyst | 8338.4117 | 4835.815 | 1.724 | 0.085 | -1146.832 | 1.78e+04 |
| Q5_DBA/Database Engineer | -8461.8924 | 1.3e+04 | -0.652 | 0.515 | -3.39e+04 | 1.7e+04 |
| Q5_Data Analyst | -505.3368 | 3921.993 | -0.129 | 0.897 | -8198.158 | 7187.485 |
| Q5_Data Engineer | 9645.7086 | 6837.554 | 1.411 | 0.159 | -3765.860 | 2.31e+04 |
| Q5_Data Scientist | 9812.0362 | 3762.486 | 2.608 | 0.009 | 2432.082 | 1.72e+04 |
| Q5_Machine Learning Engineer | 989.9825 | 5247.103 | 0.189 | 0.850 | -9301.985 | 1.13e+04 |
| Q5_Product/Project Manager | 2.074e+04 | 6223.540 | 3.333 | 0.001 | 8536.779 | 3.3e+04 |
| Q5_Research Scientist | -2633.8866 | 4540.896 | -0.580 | 0.562 | -1.15e+04 | 6272.886 |
| Q5_Software Engineer | 3043.3358 | 4454.516 | 0.683 | 0.495 | -5694.006 | 1.18e+04 |
| Q5_Statistician | -721.8901 | 6659.766 | -0.108 | 0.914 | -1.38e+04 | 1.23e+04 |
| Q7_Part_1_Python | -1987.9513 | 3034.027 | -0.655 | 0.512 | -7939.066 | 3963.163 |
| Q7_Part_2_R | -3503.0673 | 2547.904 | -1.375 | 0.169 | -8500.673 | 1494.538 |
| Q7_Part_3_SQL | -85.7876 | 2407.232 | -0.036 | 0.972 | -4807.470 | 4635.895 |
| Q7_Part_4_C | 3651.6790 | 3793.279 | 0.963 | 0.336 | -3788.675 | 1.11e+04 |
| Q7_Part_5_C++ | 441.8752 | 3593.246 | 0.123 | 0.902 | -6606.124 | 7489.874 |
| Q7_Part_6_Java | -521.6809 | 3375.162 | -0.155 | 0.877 | -7141.917 | 6098.555 |
| Q7_Part_7_Javascript | -4303.9568 | 3475.900 | -1.238 | 0.216 | -1.11e+04 | 2513.873 |
| Q7_Part_8_Julia | 4.335e+04 | 1.55e+04 | 2.803 | 0.005 | 1.3e+04 | 7.37e+04 |
| Q7_Part_9_Swift | 1.23e+04 | 1.65e+04 | 0.746 | 0.456 | -2e+04 | 4.46e+04 |
| Q7_Part_10_Bash | 8610.6046 | 4430.767 | 1.943 | 0.052 | -80.155 | 1.73e+04 |
| Q7_Part_11_MATLAB | -1.015e+04 | 3388.790 | -2.995 | 0.003 | -1.68e+04 | -3501.060 |
| Q7_Part_12_None | -2.098e+04 | 1.03e+04 | -2.039 | 0.042 | -4.12e+04 | -797.154 |
| Q3_Argentina | -1.308e+04 | 1.3e+04 | -1.007 | 0.314 | -3.86e+04 | 1.24e+04 |
| Q3_Australia | 5.741e+04 | 1.36e+04 | 4.220 | 0.000 | 3.07e+04 | 8.41e+04 |
| Q3_Bangladesh | -2.112e+04 | 2.5e+04 | -0.846 | 0.398 | -7.01e+04 | 2.79e+04 |
| Q3_Belarus | -8172.4748 | 1.96e+04 | -0.417 | 0.677 | -4.66e+04 | 3.03e+04 |
| Q3_Belgium | -1333.6895 | 1.96e+04 | -0.068 | 0.946 | -3.97e+04 | 3.7e+04 |
| Q3_Brazil | -1.87e+04 | 7479.288 | -2.500 | 0.013 | -3.34e+04 | -4027.631 |
| Q3_Canada | 3.986e+04 | 7989.572 | 4.989 | 0.000 | 2.42e+04 | 5.55e+04 |
| Q3_Chile | -1.113e+04 | 1.67e+04 | -0.665 | 0.506 | -4.4e+04 | 2.17e+04 |
| Q3_China | 4.314e+04 | 1.28e+04 | 3.379 | 0.001 | 1.81e+04 | 6.82e+04 |
| Q3_Colombia | -2.244e+04 | 1.36e+04 | -1.655 | 0.098 | -4.9e+04 | 4155.515 |
| Q3_Egypt | -1.541e+04 | 1.04e+04 | -1.477 | 0.140 | -3.59e+04 | 5060.862 |
| Q3_France | 1226.3523 | 8865.926 | 0.138 | 0.890 | -1.62e+04 | 1.86e+04 |
| Q3_Germany | 3.069e+04 | 8530.480 | 3.598 | 0.000 | 1.4e+04 | 4.74e+04 |
| Q3_Ghana | -1.327e+04 | 2.18e+04 | -0.609 | 0.543 | -5.6e+04 | 2.95e+04 |
| Q3_Greece | 324.6708 | 1.57e+04 | 0.021 | 0.983 | -3.04e+04 | 3.11e+04 |
| Q3_India | -1.568e+04 | 4775.143 | -3.283 | 0.001 | -2.5e+04 | -6311.992 |
| Q3_Indonesia | -3795.9217 | 8612.640 | -0.441 | 0.659 | -2.07e+04 | 1.31e+04 |
| Q3_Iran, Islamic Republic of... | -1.415e+04 | 1.07e+04 | -1.318 | 0.188 | -3.52e+04 | 6909.440 |
| Q3_Ireland | 1.289e+04 | 1.25e+04 | 1.028 | 0.304 | -1.17e+04 | 3.75e+04 |
| Q3_Israel | 2.485e+04 | 1.66e+04 | 1.496 | 0.135 | -7736.254 | 5.74e+04 |
| Q3_Italy | -160.2948 | 1.02e+04 | -0.016 | 0.987 | -2.02e+04 | 1.99e+04 |
| Q3_Japan | 1.827e+04 | 1.05e+04 | 1.736 | 0.083 | -2374.581 | 3.89e+04 |
| Q3_Kenya | -8390.0548 | 1.41e+04 | -0.594 | 0.553 | -3.61e+04 | 1.93e+04 |
| Q3_Malaysia | -1.608e+04 | 1.09e+04 | -1.469 | 0.142 | -3.76e+04 | 5389.447 |
| Q3_Mexico | -1.429e+04 | 1.22e+04 | -1.173 | 0.241 | -3.82e+04 | 9597.692 |
| Q3_Morocco | -8864.3305 | 1.43e+04 | -0.621 | 0.535 | -3.69e+04 | 1.91e+04 |
| Q3_Nepal | -1.226e+04 | 4.28e+04 | -0.287 | 0.774 | -9.62e+04 | 7.16e+04 |
| Q3_Netherlands | 3.962e+04 | 1.12e+04 | 3.544 | 0.000 | 1.77e+04 | 6.15e+04 |
| Q3_Nigeria | -1875.3215 | 7956.391 | -0.236 | 0.814 | -1.75e+04 | 1.37e+04 |
| Q3_Pakistan | -1.335e+04 | 1.05e+04 | -1.275 | 0.202 | -3.39e+04 | 7185.328 |
| Q3_Peru | -1.373e+04 | 1.41e+04 | -0.973 | 0.331 | -4.14e+04 | 1.4e+04 |
| Q3_Philippines | -1.36e+04 | 1.26e+04 | -1.082 | 0.279 | -3.83e+04 | 1.11e+04 |
| Q3_Poland | -8785.9664 | 1.3e+04 | -0.674 | 0.500 | -3.44e+04 | 1.68e+04 |
| Q3_Portugal | 4644.1599 | 1e+04 | 0.462 | 0.644 | -1.51e+04 | 2.43e+04 |
| Q3_Republic of Korea | -3.423e+04 | 3.04e+04 | -1.124 | 0.261 | -9.39e+04 | 2.55e+04 |
| Q3_Romania | -1.385e+04 | 1.69e+04 | -0.821 | 0.412 | -4.7e+04 | 1.93e+04 |
| Q3_Russia | -1.06e+04 | 8225.841 | -1.288 | 0.198 | -2.67e+04 | 5538.794 |
| Q3_Saudi Arabia | -1.634e+04 | 1.48e+04 | -1.106 | 0.269 | -4.53e+04 | 1.26e+04 |
| Q3_Singapore | 1.624e+04 | 1.18e+04 | 1.376 | 0.169 | -6911.940 | 3.94e+04 |
| Q3_South Africa | -1.537e+04 | 1.36e+04 | -1.132 | 0.258 | -4.2e+04 | 1.13e+04 |
| Q3_South Korea | -693.5168 | 1.26e+04 | -0.055 | 0.956 | -2.54e+04 | 2.41e+04 |
| Q3_Spain | -8311.6612 | 9000.940 | -0.923 | 0.356 | -2.6e+04 | 9343.297 |
| Q3_Sri Lanka | -1.669e+04 | 1.41e+04 | -1.181 | 0.238 | -4.44e+04 | 1.1e+04 |
| Q3_Sweden | 1.753e+04 | 1.57e+04 | 1.116 | 0.265 | -1.33e+04 | 4.83e+04 |
| Q3_Switzerland | 8.274e+04 | 1.57e+04 | 5.273 | 0.000 | 5.2e+04 | 1.14e+05 |
| Q3_Taiwan | -1647.5790 | 1.18e+04 | -0.139 | 0.889 | -2.48e+04 | 2.15e+04 |
| Q3_Thailand | -3744.9789 | 1.26e+04 | -0.296 | 0.767 | -2.85e+04 | 2.11e+04 |
| Q3_Tunisia | -9274.3647 | 1.04e+04 | -0.895 | 0.371 | -2.96e+04 | 1.11e+04 |
| Q3_Turkey | -1.576e+04 | 9693.729 | -1.626 | 0.104 | -3.48e+04 | 3253.614 |
| Q3_Ukraine | -7619.8623 | 1.25e+04 | -0.609 | 0.543 | -3.22e+04 | 1.69e+04 |
| Q3_United Arab Emirates | -1809.0733 | 1.95e+04 | -0.093 | 0.926 | -4e+04 | 3.64e+04 |
| Q3_United Kingdom of Great Britain and Northern Ireland | 1.879e+04 | 7060.924 | 2.661 | 0.008 | 4941.313 | 3.26e+04 |
| Q3_United States of America | 6.822e+04 | 4997.544 | 13.650 | 0.000 | 5.84e+04 | 7.8e+04 |
| Q3_Viet Nam | -7097.9739 | 1.3e+04 | -0.547 | 0.584 | -3.25e+04 | 1.83e+04 |
| Q6_1-2 years | 6341.5079 | 5187.947 | 1.222 | 0.222 | -3834.428 | 1.65e+04 |
| Q6_10-20 years | 2.96e+04 | 5915.886 | 5.003 | 0.000 | 1.8e+04 | 4.12e+04 |
| Q6_20+ years | 1.648e+04 | 6885.656 | 2.393 | 0.017 | 2971.925 | 3e+04 |
| Q6_3-5 years | 6092.0781 | 5212.061 | 1.169 | 0.243 | -4131.156 | 1.63e+04 |
| Q6_5-10 years | 1.456e+04 | 5479.547 | 2.657 | 0.008 | 3808.925 | 2.53e+04 |
| Q6_< 1 years | 5663.7839 | 5426.692 | 1.044 | 0.297 | -4980.440 | 1.63e+04 |
| Q20_0-49 employees | -1.177e+04 | 3257.520 | -3.614 | 0.000 | -1.82e+04 | -5383.053 |
| Q20_10,000 or more employees | 9435.4404 | 3805.842 | 2.479 | 0.013 | 1970.444 | 1.69e+04 |
| Q20_1000-9,999 employees | 2244.2847 | 3815.877 | 0.588 | 0.557 | -5240.394 | 9728.963 |
| Q20_250-999 employees | 4485.5733 | 3912.840 | 1.146 | 0.252 | -3189.294 | 1.22e+04 |
| Omnibus: | 1732.338 | Durbin-Watson: | 1.993 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 155341.130 |
| Skew: | 4.813 | Prob(JB): | 0.00 |
| Kurtosis: | 49.071 | Cond. No. | 72.6 |
# Refit the women's model with a penalty. L1_wt=1 makes the elastic net a pure
# L1 (lasso) penalty, and alpha is the penalty weight.
results_reg_W = model_W.fit_regularized(alpha=5, L1_wt=1)

# Wrap the penalised coefficients in an OLSResults object so that a summary
# table can be printed.
# NOTE(review): the covariance passed here is model_W.normalized_cov_params,
# not one derived from the penalised fit, so the std err / t / P>|t| columns
# of the summary should be read with caution -- confirm before relying on them.
final_W = sm.regression.linear_model.OLSResults(
    model_W,
    results_reg_W.params,
    normalized_cov_params=model_W.normalized_cov_params,
)
print(final_W.summary())
OLS Regression Results
==============================================================================
Dep. Variable: aprox_salary R-squared: 0.453
Model: OLS Adj. R-squared: 0.421
Method: Least Squares F-statistic: 14.30
Date: Wed, 27 Jul 2022 Prob (F-statistic): 2.41e-149
Time: 15:55:30 Log-Likelihood: -20268.
No. Observations: 1683 AIC: 4.072e+04
Df Residuals: 1590 BIC: 4.123e+04
Df Model: 92
Covariance Type: nonrobust
========================================================================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------------------------------------------------------
const 1.789e+04 1.07e+04 1.667 0.096 -3164.903 3.89e+04
Q4_Bachelor’s degree -753.1711 9331.157 -0.081 0.936 -1.91e+04 1.75e+04
Q4_Doctoral degree 9596.4453 9542.889 1.006 0.315 -9121.522 2.83e+04
Q4_Master’s degree 5866.7117 9266.697 0.633 0.527 -1.23e+04 2.4e+04
Q4_No formal education past high school 0 2.37e+04 0 1.000 -4.65e+04 4.65e+04
Q4_Professional degree 6823.2280 1.04e+04 0.653 0.514 -1.37e+04 2.73e+04
Q4_Some college/university study without earning a bachelor’s degree -7174.8753 1.17e+04 -0.616 0.538 -3e+04 1.57e+04
Q5_Business Analyst 6725.2876 4840.848 1.389 0.165 -2769.828 1.62e+04
Q5_DBA/Database Engineer -9487.4445 1.3e+04 -0.730 0.465 -3.5e+04 1.6e+04
Q5_Data Analyst -1691.5593 3926.075 -0.431 0.667 -9392.387 6009.269
Q5_Data Engineer 7962.8922 6844.670 1.163 0.245 -5462.635 2.14e+04
Q5_Data Scientist 8706.3022 3766.402 2.312 0.021 1318.667 1.61e+04
Q5_Machine Learning Engineer -108.8149 5252.564 -0.021 0.983 -1.04e+04 1.02e+04
Q5_Product/Project Manager 1.914e+04 6230.018 3.073 0.002 6923.522 3.14e+04
Q5_Research Scientist -3598.9057 4545.622 -0.792 0.429 -1.25e+04 5317.137
Q5_Software Engineer 0 4459.152 0 1.000 -8746.436 8746.436
Q5_Statistician -1242.9406 6666.697 -0.186 0.852 -1.43e+04 1.18e+04
Q7_Part_1_Python -350.7443 3037.185 -0.115 0.908 -6308.053 5606.564
Q7_Part_2_R -2951.3041 2550.556 -1.157 0.247 -7954.111 2051.503
Q7_Part_3_SQL 511.7946 2409.737 0.212 0.832 -4214.802 5238.391
Q7_Part_4_C 3631.3520 3797.227 0.956 0.339 -3816.746 1.11e+04
Q7_Part_5_C++ 747.8751 3596.986 0.208 0.835 -6307.459 7803.209
Q7_Part_6_Java -117.4515 3378.675 -0.035 0.972 -6744.578 6509.675
Q7_Part_7_Javascript -3735.5488 3479.518 -1.074 0.283 -1.06e+04 3089.377
Q7_Part_8_Julia 4.167e+04 1.55e+04 2.691 0.007 1.13e+04 7.2e+04
Q7_Part_9_Swift 1.138e+04 1.65e+04 0.690 0.490 -2.1e+04 4.37e+04
Q7_Part_10_Bash 8793.7454 4435.378 1.983 0.048 93.941 1.75e+04
Q7_Part_11_MATLAB -9975.0989 3392.317 -2.940 0.003 -1.66e+04 -3321.215
Q7_Part_12_None -1.92e+04 1.03e+04 -1.863 0.063 -3.94e+04 1010.236
Q3_Argentina -1.237e+04 1.3e+04 -0.951 0.342 -3.79e+04 1.31e+04
Q3_Australia 5.639e+04 1.36e+04 4.141 0.000 2.97e+04 8.31e+04
Q3_Bangladesh -1.869e+04 2.5e+04 -0.747 0.455 -6.77e+04 3.04e+04
Q3_Belarus -5627.2862 1.96e+04 -0.287 0.774 -4.41e+04 3.28e+04
Q3_Belgium -1556.0862 1.96e+04 -0.080 0.937 -3.99e+04 3.68e+04
Q3_Brazil -1.907e+04 7487.072 -2.548 0.011 -3.38e+04 -4387.965
Q3_Canada 3.929e+04 7997.887 4.912 0.000 2.36e+04 5.5e+04
Q3_Chile -1.023e+04 1.68e+04 -0.611 0.542 -4.31e+04 2.26e+04
Q3_China 4.215e+04 1.28e+04 3.298 0.001 1.71e+04 6.72e+04
Q3_Colombia -2.2e+04 1.36e+04 -1.620 0.105 -4.86e+04 4631.461
Q3_Egypt -1.558e+04 1.04e+04 -1.492 0.136 -3.61e+04 4909.550
Q3_France 0 8875.153 0 1.000 -1.74e+04 1.74e+04
Q3_Germany 3.015e+04 8539.358 3.531 0.000 1.34e+04 4.69e+04
Q3_Ghana -1.107e+04 2.18e+04 -0.508 0.612 -5.39e+04 3.17e+04
Q3_Greece 0 1.57e+04 0 1.000 -3.08e+04 3.08e+04
Q3_India -1.598e+04 4780.113 -3.343 0.001 -2.54e+04 -6603.555
Q3_Indonesia -3832.6656 8621.604 -0.445 0.657 -2.07e+04 1.31e+04
Q3_Iran, Islamic Republic of... -1.371e+04 1.07e+04 -1.276 0.202 -3.48e+04 7367.452
Q3_Ireland 1.155e+04 1.26e+04 0.920 0.358 -1.31e+04 3.62e+04
Q3_Israel 2.377e+04 1.66e+04 1.429 0.153 -8848.791 5.64e+04
Q3_Italy 0 1.02e+04 0 1.000 -2.01e+04 2.01e+04
Q3_Japan 1.75e+04 1.05e+04 1.661 0.097 -3167.008 3.82e+04
Q3_Kenya -8354.6166 1.41e+04 -0.591 0.555 -3.61e+04 1.94e+04
Q3_Malaysia -1.543e+04 1.1e+04 -1.408 0.159 -3.69e+04 6061.673
Q3_Mexico -1.334e+04 1.22e+04 -1.094 0.274 -3.72e+04 1.06e+04
Q3_Morocco -9146.4432 1.43e+04 -0.640 0.522 -3.72e+04 1.89e+04
Q3_Nepal -1510.1364 4.28e+04 -0.035 0.972 -8.55e+04 8.25e+04
Q3_Netherlands 3.937e+04 1.12e+04 3.518 0.000 1.74e+04 6.13e+04
Q3_Nigeria -1605.1449 7964.672 -0.202 0.840 -1.72e+04 1.4e+04
Q3_Pakistan -1.277e+04 1.05e+04 -1.219 0.223 -3.33e+04 7784.302
Q3_Peru -1.304e+04 1.41e+04 -0.923 0.356 -4.07e+04 1.47e+04
Q3_Philippines -1.396e+04 1.26e+04 -1.109 0.267 -3.86e+04 1.07e+04
Q3_Poland -8176.3336 1.31e+04 -0.626 0.531 -3.38e+04 1.74e+04
Q3_Portugal 0 1.01e+04 0 1.000 -1.97e+04 1.97e+04
Q3_Republic of Korea -3.065e+04 3.05e+04 -1.006 0.315 -9.04e+04 2.91e+04
Q3_Romania -1.357e+04 1.69e+04 -0.804 0.422 -4.67e+04 1.96e+04
Q3_Russia -9992.5948 8234.403 -1.214 0.225 -2.61e+04 6158.833
Q3_Saudi Arabia -1.591e+04 1.48e+04 -1.076 0.282 -4.49e+04 1.31e+04
Q3_Singapore 1.624e+04 1.18e+04 1.375 0.169 -6932.531 3.94e+04
Q3_South Africa -1.474e+04 1.36e+04 -1.085 0.278 -4.14e+04 1.19e+04
Q3_South Korea -356.3269 1.26e+04 -0.028 0.978 -2.51e+04 2.44e+04
Q3_Spain -8469.1964 9010.309 -0.940 0.347 -2.61e+04 9204.137
Q3_Sri Lanka -1.605e+04 1.42e+04 -1.134 0.257 -4.38e+04 1.17e+04
Q3_Sweden 1.644e+04 1.57e+04 1.045 0.296 -1.44e+04 4.73e+04
Q3_Switzerland 8.221e+04 1.57e+04 5.233 0.000 5.14e+04 1.13e+05
Q3_Taiwan -1097.3784 1.18e+04 -0.093 0.926 -2.43e+04 2.21e+04
Q3_Thailand -3964.0245 1.27e+04 -0.313 0.754 -2.88e+04 2.09e+04
Q3_Tunisia -9682.4769 1.04e+04 -0.933 0.351 -3e+04 1.07e+04
Q3_Turkey -1.53e+04 9703.819 -1.577 0.115 -3.43e+04 3731.810
Q3_Ukraine -6711.1602 1.25e+04 -0.536 0.592 -3.13e+04 1.79e+04
Q3_United Arab Emirates -1369.2247 1.95e+04 -0.070 0.944 -3.96e+04 3.69e+04
Q3_United Kingdom of Great Britain and Northern Ireland 1.846e+04 7068.273 2.612 0.009 4598.307 3.23e+04
Q3_United States of America 6.793e+04 5002.745 13.578 0.000 5.81e+04 7.77e+04
Q3_Viet Nam -7301.3795 1.3e+04 -0.562 0.574 -3.28e+04 1.82e+04
Q6_1-2 years 1280.2710 5193.347 0.247 0.805 -8906.256 1.15e+04
Q6_10-20 years 2.457e+04 5922.044 4.149 0.000 1.3e+04 3.62e+04
Q6_20+ years 1.149e+04 6892.822 1.667 0.096 -2027.136 2.5e+04
Q6_3-5 years 0 5217.486 0 1.000 -1.02e+04 1.02e+04
Q6_5-10 years 9523.1238 5485.250 1.736 0.083 -1235.958 2.03e+04
Q6_< 1 years 411.6425 5432.340 0.076 0.940 -1.02e+04 1.11e+04
Q20_0-49 employees -1.337e+04 3260.910 -4.100 0.000 -1.98e+04 -6973.835
Q20_10,000 or more employees 8450.7741 3809.803 2.218 0.027 978.008 1.59e+04
Q20_1000-9,999 employees 0 3819.848 0 1.000 -7492.468 7492.468
Q20_250-999 employees 3166.2161 3916.913 0.808 0.419 -4516.640 1.08e+04
==============================================================================
Omnibus: 1736.731 Durbin-Watson: 1.998
Prob(Omnibus): 0.000 Jarque-Bera (JB): 156588.973
Skew: 4.832 Prob(JB): 0.00
Kurtosis: 49.256 Cond. No. 72.6
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Build the design matrix for the men's subset and fit an OLS salary model.
men_fin = qnums(['Q4', 'Q5', 'Q7', 'Q3', 'Q6', 'Q20'], Men_Model)
Y_M = Men_Model['aprox_salary']

# One dummy column per question is removed before fitting
# (presumably to serve as the reference level -- confirm).
X_M = sm.add_constant(
    men_fin.drop(
        columns=[
            'Q4_I prefer not to answer',
            'Q5_Other',
            'Q7_OTHER_Other',
            'Q3_Other',
            'Q6_I have never written code',
            'Q20_50-249 employees',
        ]
    )
)

model_M = sm.OLS(endog=Y_M, exog=X_M)
results_M = model_M.fit()
results_M.summary()
| Dep. Variable: | aprox_salary | R-squared: | 0.386 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.379 |
| Method: | Least Squares | F-statistic: | 59.90 |
| Date: | Wed, 27 Jul 2022 | Prob (F-statistic): | 0.00 |
| Time: | 15:55:30 | Log-Likelihood: | -1.0891e+05 |
| No. Observations: | 8872 | AIC: | 2.180e+05 |
| Df Residuals: | 8779 | BIC: | 2.187e+05 |
| Df Model: | 92 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| const | 2.113e+04 | 5925.627 | 3.565 | 0.000 | 9509.814 | 3.27e+04 |
| Q4_Bachelor’s degree | 2269.9925 | 4888.071 | 0.464 | 0.642 | -7311.771 | 1.19e+04 |
| Q4_Doctoral degree | 1.03e+04 | 5068.551 | 2.032 | 0.042 | 365.453 | 2.02e+04 |
| Q4_Master’s degree | 5361.5291 | 4853.175 | 1.105 | 0.269 | -4151.832 | 1.49e+04 |
| Q4_No formal education past high school | -4760.1632 | 7067.155 | -0.674 | 0.501 | -1.86e+04 | 9093.116 |
| Q4_Professional degree | 8982.4124 | 5472.166 | 1.641 | 0.101 | -1744.315 | 1.97e+04 |
| Q4_Some college/university study without earning a bachelor’s degree | 43.3111 | 5553.019 | 0.008 | 0.994 | -1.08e+04 | 1.09e+04 |
| Q5_Business Analyst | -2030.1665 | 2720.312 | -0.746 | 0.456 | -7362.616 | 3302.283 |
| Q5_DBA/Database Engineer | -8956.3773 | 5665.434 | -1.581 | 0.114 | -2.01e+04 | 2149.200 |
| Q5_Data Analyst | -1.004e+04 | 2301.624 | -4.364 | 0.000 | -1.46e+04 | -5532.784 |
| Q5_Data Engineer | -5313.9576 | 3377.355 | -1.573 | 0.116 | -1.19e+04 | 1306.450 |
| Q5_Data Scientist | 4194.0621 | 1985.915 | 2.112 | 0.035 | 301.204 | 8086.920 |
| Q5_Machine Learning Engineer | 1835.2211 | 2471.241 | 0.743 | 0.458 | -3008.990 | 6679.432 |
| Q5_Product/Project Manager | 9501.9436 | 2762.356 | 3.440 | 0.001 | 4087.079 | 1.49e+04 |
| Q5_Research Scientist | -8548.2046 | 2554.309 | -3.347 | 0.001 | -1.36e+04 | -3541.161 |
| Q5_Software Engineer | -6785.4557 | 2225.012 | -3.050 | 0.002 | -1.11e+04 | -2423.911 |
| Q5_Statistician | -6386.1253 | 4197.408 | -1.521 | 0.128 | -1.46e+04 | 1841.778 |
| Q7_Part_1_Python | 1221.7586 | 1805.164 | 0.677 | 0.499 | -2316.786 | 4760.303 |
| Q7_Part_2_R | 0.2495 | 1408.716 | 0.000 | 1.000 | -2761.163 | 2761.662 |
| Q7_Part_3_SQL | 1311.2347 | 1261.566 | 1.039 | 0.299 | -1161.730 | 3784.200 |
| Q7_Part_4_C | -495.6441 | 1897.756 | -0.261 | 0.794 | -4215.691 | 3224.403 |
| Q7_Part_5_C++ | -2469.2897 | 1741.525 | -1.418 | 0.156 | -5883.087 | 944.507 |
| Q7_Part_6_Java | 215.4759 | 1663.898 | 0.130 | 0.897 | -3046.154 | 3477.106 |
| Q7_Part_7_Javascript | -3106.4598 | 1634.430 | -1.901 | 0.057 | -6310.325 | 97.405 |
| Q7_Part_8_Julia | 7160.4571 | 4212.060 | 1.700 | 0.089 | -1096.166 | 1.54e+04 |
| Q7_Part_9_Swift | 4699.3419 | 4898.201 | 0.959 | 0.337 | -4902.280 | 1.43e+04 |
| Q7_Part_10_Bash | 7388.5771 | 1710.986 | 4.318 | 0.000 | 4034.644 | 1.07e+04 |
| Q7_Part_11_MATLAB | -4377.7019 | 1976.712 | -2.215 | 0.027 | -8252.521 | -502.883 |
| Q7_Part_12_None | -6980.8375 | 6136.348 | -1.138 | 0.255 | -1.9e+04 | 5047.841 |
| Q3_Argentina | -1.951e+04 | 6620.102 | -2.947 | 0.003 | -3.25e+04 | -6533.388 |
| Q3_Australia | 4.908e+04 | 5045.418 | 9.728 | 0.000 | 3.92e+04 | 5.9e+04 |
| Q3_Bangladesh | -1.612e+04 | 8235.724 | -1.957 | 0.050 | -3.23e+04 | 22.792 |
| Q3_Belarus | -1.716e+04 | 9465.935 | -1.813 | 0.070 | -3.57e+04 | 1393.318 |
| Q3_Belgium | 2.052e+04 | 9765.840 | 2.101 | 0.036 | 1371.949 | 3.97e+04 |
| Q3_Brazil | -1.275e+04 | 3322.699 | -3.837 | 0.000 | -1.93e+04 | -6236.328 |
| Q3_Canada | 3.551e+04 | 4651.266 | 7.635 | 0.000 | 2.64e+04 | 4.46e+04 |
| Q3_Chile | -7461.8209 | 7342.751 | -1.016 | 0.310 | -2.19e+04 | 6931.691 |
| Q3_China | 5485.7584 | 4756.146 | 1.153 | 0.249 | -3837.403 | 1.48e+04 |
| Q3_Colombia | -1.808e+04 | 5405.063 | -3.345 | 0.001 | -2.87e+04 | -7483.048 |
| Q3_Egypt | -1.377e+04 | 6535.870 | -2.108 | 0.035 | -2.66e+04 | -962.984 |
| Q3_France | 9273.2192 | 4584.598 | 2.023 | 0.043 | 286.334 | 1.83e+04 |
| Q3_Germany | 3.186e+04 | 4110.339 | 7.751 | 0.000 | 2.38e+04 | 3.99e+04 |
| Q3_Ghana | 1.485e+04 | 1.28e+04 | 1.156 | 0.248 | -1.03e+04 | 4e+04 |
| Q3_Greece | -1.012e+04 | 7215.482 | -1.402 | 0.161 | -2.43e+04 | 4025.118 |
| Q3_India | -1.04e+04 | 2403.673 | -4.327 | 0.000 | -1.51e+04 | -5688.589 |
| Q3_Indonesia | -5605.0097 | 5915.351 | -0.948 | 0.343 | -1.72e+04 | 5990.463 |
| Q3_Iran, Islamic Republic of... | -2.413e+04 | 7174.963 | -3.363 | 0.001 | -3.82e+04 | -1.01e+04 |
| Q3_Ireland | 2.33e+04 | 1.16e+04 | 2.010 | 0.044 | 578.371 | 4.6e+04 |
| Q3_Israel | 7.638e+04 | 7339.383 | 10.407 | 0.000 | 6.2e+04 | 9.08e+04 |
| Q3_Italy | -1457.3530 | 4597.576 | -0.317 | 0.751 | -1.05e+04 | 7554.973 |
| Q3_Japan | 7379.6536 | 3454.566 | 2.136 | 0.033 | 607.895 | 1.42e+04 |
| Q3_Kenya | -1.345e+04 | 6989.844 | -1.924 | 0.054 | -2.71e+04 | 256.236 |
| Q3_Malaysia | -4198.0675 | 8956.114 | -0.469 | 0.639 | -2.18e+04 | 1.34e+04 |
| Q3_Mexico | -1.024e+04 | 5138.369 | -1.992 | 0.046 | -2.03e+04 | -162.675 |
| Q3_Morocco | -1.481e+04 | 7717.073 | -1.919 | 0.055 | -2.99e+04 | 315.021 |
| Q3_Nepal | -6426.2255 | 1.22e+04 | -0.528 | 0.597 | -3.03e+04 | 1.74e+04 |
| Q3_Netherlands | 2.784e+04 | 5853.721 | 4.757 | 0.000 | 1.64e+04 | 3.93e+04 |
| Q3_Nigeria | -1.39e+04 | 4229.207 | -3.287 | 0.001 | -2.22e+04 | -5609.298 |
| Q3_Pakistan | -1.125e+04 | 5604.261 | -2.008 | 0.045 | -2.22e+04 | -267.485 |
| Q3_Peru | -1.354e+04 | 7533.743 | -1.798 | 0.072 | -2.83e+04 | 1223.724 |
| Q3_Philippines | -1.138e+04 | 8957.029 | -1.271 | 0.204 | -2.89e+04 | 6173.939 |
| Q3_Poland | -6122.8646 | 6057.455 | -1.011 | 0.312 | -1.8e+04 | 5751.165 |
| Q3_Portugal | -9317.8462 | 6858.692 | -1.359 | 0.174 | -2.28e+04 | 4126.796 |
| Q3_Republic of Korea | 7704.1139 | 8521.600 | 0.904 | 0.366 | -9000.218 | 2.44e+04 |
| Q3_Romania | -1.253e+04 | 9617.143 | -1.303 | 0.193 | -3.14e+04 | 6320.211 |
| Q3_Russia | -1.255e+04 | 3610.544 | -3.475 | 0.001 | -1.96e+04 | -5469.820 |
| Q3_Saudi Arabia | 5666.2681 | 8731.060 | 0.649 | 0.516 | -1.14e+04 | 2.28e+04 |
| Q3_Singapore | 2.208e+04 | 6498.394 | 3.397 | 0.001 | 9339.378 | 3.48e+04 |
| Q3_South Africa | 9024.2140 | 6654.557 | 1.356 | 0.175 | -4020.276 | 2.21e+04 |
| Q3_South Korea | 1.092e+04 | 6419.913 | 1.701 | 0.089 | -1662.134 | 2.35e+04 |
| Q3_Spain | 7000.2910 | 4191.759 | 1.670 | 0.095 | -1216.539 | 1.52e+04 |
| Q3_Sri Lanka | -1.13e+04 | 1.01e+04 | -1.119 | 0.263 | -3.11e+04 | 8497.036 |
| Q3_Sweden | 1.589e+04 | 7978.751 | 1.992 | 0.046 | 253.579 | 3.15e+04 |
| Q3_Switzerland | 7.759e+04 | 8526.711 | 9.100 | 0.000 | 6.09e+04 | 9.43e+04 |
| Q3_Taiwan | -8792.4576 | 5467.922 | -1.608 | 0.108 | -1.95e+04 | 1925.950 |
| Q3_Thailand | 659.7027 | 7112.799 | 0.093 | 0.926 | -1.33e+04 | 1.46e+04 |
| Q3_Tunisia | -1028.3530 | 9927.256 | -0.104 | 0.917 | -2.05e+04 | 1.84e+04 |
| Q3_Turkey | -1.416e+04 | 4886.247 | -2.898 | 0.004 | -2.37e+04 | -4584.457 |
| Q3_Ukraine | -1.139e+04 | 5632.696 | -2.021 | 0.043 | -2.24e+04 | -343.826 |
| Q3_United Arab Emirates | 3.535e+04 | 8511.247 | 4.154 | 0.000 | 1.87e+04 | 5.2e+04 |
| Q3_United Kingdom of Great Britain and Northern Ireland | 4.465e+04 | 3714.245 | 12.020 | 0.000 | 3.74e+04 | 5.19e+04 |
| Q3_United States of America | 8.058e+04 | 2567.322 | 31.387 | 0.000 | 7.55e+04 | 8.56e+04 |
| Q3_Viet Nam | -1.468e+04 | 6549.363 | -2.241 | 0.025 | -2.75e+04 | -1841.491 |
| Q6_1-2 years | -3353.2886 | 3444.738 | -0.973 | 0.330 | -1.01e+04 | 3399.206 |
| Q6_10-20 years | 2.358e+04 | 3567.115 | 6.609 | 0.000 | 1.66e+04 | 3.06e+04 |
| Q6_20+ years | 3.303e+04 | 3670.421 | 8.998 | 0.000 | 2.58e+04 | 4.02e+04 |
| Q6_3-5 years | 68.2991 | 3409.108 | 0.020 | 0.984 | -6614.351 | 6750.950 |
| Q6_5-10 years | 1.127e+04 | 3483.048 | 3.235 | 0.001 | 4438.370 | 1.81e+04 |
| Q6_< 1 years | -3146.2278 | 3522.090 | -0.893 | 0.372 | -1.01e+04 | 3757.893 |
| Q20_0-49 employees | -6490.0454 | 1740.296 | -3.729 | 0.000 | -9901.434 | -3078.657 |
| Q20_10,000 or more employees | 1.157e+04 | 1943.684 | 5.953 | 0.000 | 7760.545 | 1.54e+04 |
| Q20_1000-9,999 employees | 6397.8451 | 1977.474 | 3.235 | 0.001 | 2521.533 | 1.03e+04 |
| Q20_250-999 employees | 2536.9304 | 2187.703 | 1.160 | 0.246 | -1751.481 | 6825.341 |
| Omnibus: | 7619.234 | Durbin-Watson: | 2.002 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 320551.693 |
| Skew: | 3.950 | Prob(JB): | 0.00 |
| Kurtosis: | 31.368 | Cond. No. | 52.5 |
# Refit the men's model with a penalty. L1_wt=1 makes the elastic net a pure
# L1 (lasso) penalty, and alpha is the penalty weight.
results_reg_M = model_M.fit_regularized(alpha=5, L1_wt=1)

# Wrap the penalised coefficients in an OLSResults object so that a summary
# table can be printed.
# NOTE(review): the covariance passed here is model_M.normalized_cov_params,
# not one derived from the penalised fit, so the std err / t / P>|t| columns
# of the summary should be read with caution -- confirm before relying on them.
final_M = sm.regression.linear_model.OLSResults(
    model_M,
    results_reg_M.params,
    normalized_cov_params=model_M.normalized_cov_params,
)
print(final_M.summary())
OLS Regression Results
==============================================================================
Dep. Variable: aprox_salary R-squared: 0.384
Model: OLS Adj. R-squared: 0.378
Method: Least Squares F-statistic: 59.54
Date: Wed, 27 Jul 2022 Prob (F-statistic): 0.00
Time: 15:55:33 Log-Likelihood: -1.0892e+05
No. Observations: 8872 AIC: 2.180e+05
Df Residuals: 8779 BIC: 2.187e+05
Df Model: 92
Covariance Type: nonrobust
========================================================================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------------------------------------------------------
const 3.468e+04 5932.655 5.846 0.000 2.31e+04 4.63e+04
Q4_Bachelor’s degree 0 4893.868 0 1.000 -9593.128 9593.128
Q4_Doctoral degree 8264.3993 5074.562 1.629 0.103 -1682.931 1.82e+04
Q4_Master’s degree 3271.3199 4858.931 0.673 0.501 -6253.324 1.28e+04
Q4_No formal education past high school -6537.4051 7075.537 -0.924 0.356 -2.04e+04 7332.304
Q4_Professional degree 6278.9759 5478.656 1.146 0.252 -4460.474 1.7e+04
Q4_Some college/university study without earning a bachelor’s degree -2260.4077 5559.604 -0.407 0.684 -1.32e+04 8637.719
Q5_Business Analyst -1998.0261 2723.539 -0.734 0.463 -7336.800 3340.747
Q5_DBA/Database Engineer -8095.6059 5672.153 -1.427 0.154 -1.92e+04 3023.143
Q5_Data Analyst -9735.0570 2304.354 -4.225 0.000 -1.43e+04 -5217.983
Q5_Data Engineer -4610.5224 3381.361 -1.364 0.173 -1.12e+04 2017.737
Q5_Data Scientist 4763.8449 1988.270 2.396 0.017 866.370 8661.320
Q5_Machine Learning Engineer 2424.3817 2474.172 0.980 0.327 -2425.574 7274.338
Q5_Product/Project Manager 9614.6890 2765.632 3.476 0.001 4193.403 1.5e+04
Q5_Research Scientist -7804.6728 2557.338 -3.052 0.002 -1.28e+04 -2791.691
Q5_Software Engineer -5616.1137 2227.651 -2.521 0.012 -9982.831 -1249.396
Q5_Statistician -5766.4457 4202.386 -1.372 0.170 -1.4e+04 2471.216
Q7_Part_1_Python 3770.5574 1807.305 2.086 0.037 227.816 7313.299
Q7_Part_2_R 563.6165 1410.387 0.400 0.689 -2201.071 3328.304
Q7_Part_3_SQL 1653.6484 1263.062 1.309 0.190 -822.250 4129.546
Q7_Part_4_C -113.3245 1900.007 -0.060 0.952 -3837.783 3611.134
Q7_Part_5_C++ -2430.9730 1743.590 -1.394 0.163 -5848.819 986.873
Q7_Part_6_Java 231.6885 1665.871 0.139 0.889 -3033.810 3497.187
Q7_Part_7_Javascript -2995.9778 1636.368 -1.831 0.067 -6203.643 211.687
Q7_Part_8_Julia 6852.0547 4217.055 1.625 0.104 -1414.361 1.51e+04
Q7_Part_9_Swift 4880.6944 4904.011 0.995 0.320 -4732.315 1.45e+04
Q7_Part_10_Bash 7241.8094 1713.015 4.228 0.000 3883.898 1.06e+04
Q7_Part_11_MATLAB -4363.9347 1979.056 -2.205 0.027 -8243.349 -484.521
Q7_Part_12_None -3684.8010 6143.625 -0.600 0.549 -1.57e+04 8358.144
Q3_Argentina -2.28e+04 6627.953 -3.440 0.001 -3.58e+04 -9804.740
Q3_Australia 4.531e+04 5051.402 8.970 0.000 3.54e+04 5.52e+04
Q3_Bangladesh -1.856e+04 8245.491 -2.251 0.024 -3.47e+04 -2399.743
Q3_Belarus -1.981e+04 9477.161 -2.091 0.037 -3.84e+04 -1236.754
Q3_Belgium 1.576e+04 9777.423 1.612 0.107 -3404.450 3.49e+04
Q3_Brazil -1.646e+04 3326.640 -4.947 0.000 -2.3e+04 -9935.268
Q3_Canada 3.155e+04 4656.782 6.775 0.000 2.24e+04 4.07e+04
Q3_Chile -1.002e+04 7351.459 -1.363 0.173 -2.44e+04 4394.150
Q3_China 0 4761.787 0 1.000 -9334.218 9334.218
Q3_Colombia -2.16e+04 5411.474 -3.991 0.000 -3.22e+04 -1.1e+04
Q3_Egypt -1.671e+04 6543.622 -2.553 0.011 -2.95e+04 -3882.060
Q3_France 5458.2950 4590.035 1.189 0.234 -3539.249 1.45e+04
Q3_Germany 2.831e+04 4115.214 6.879 0.000 2.02e+04 3.64e+04
Q3_Ghana 0 1.29e+04 0 1.000 -2.52e+04 2.52e+04
Q3_Greece -1.322e+04 7224.040 -1.830 0.067 -2.74e+04 940.223
Q3_India -1.443e+04 2406.524 -5.995 0.000 -1.91e+04 -9708.818
Q3_Indonesia -9261.6153 5922.366 -1.564 0.118 -2.09e+04 2347.610
Q3_Iran, Islamic Republic of... -2.696e+04 7183.472 -3.753 0.000 -4.1e+04 -1.29e+04
Q3_Ireland 1.652e+04 1.16e+04 1.424 0.155 -6224.274 3.93e+04
Q3_Israel 7.234e+04 7348.087 9.844 0.000 5.79e+04 8.67e+04
Q3_Italy -4654.4396 4603.029 -1.011 0.312 -1.37e+04 4368.575
Q3_Japan 0 3458.663 0 1.000 -6779.790 6779.790
Q3_Kenya -1.658e+04 6998.134 -2.370 0.018 -3.03e+04 -2866.607
Q3_Malaysia -6500.4962 8966.736 -0.725 0.468 -2.41e+04 1.11e+04
Q3_Mexico -1.351e+04 5144.463 -2.626 0.009 -2.36e+04 -3423.889
Q3_Morocco -1.785e+04 7726.225 -2.310 0.021 -3.3e+04 -2703.521
Q3_Nepal -8436.9915 1.22e+04 -0.692 0.489 -3.23e+04 1.54e+04
Q3_Netherlands 2.357e+04 5860.664 4.021 0.000 1.21e+04 3.51e+04
Q3_Nigeria -1.805e+04 4234.222 -4.264 0.000 -2.64e+04 -9754.066
Q3_Pakistan -1.472e+04 5610.908 -2.623 0.009 -2.57e+04 -3717.137
Q3_Peru -1.593e+04 7542.678 -2.113 0.035 -3.07e+04 -1149.421
Q3_Philippines -1.407e+04 8967.652 -1.568 0.117 -3.16e+04 3513.640
Q3_Poland -9419.0895 6064.639 -1.553 0.120 -2.13e+04 2469.023
Q3_Portugal -1.2e+04 6866.826 -1.748 0.081 -2.55e+04 1459.529
Q3_Republic of Korea 0 8531.707 0 1.000 -1.67e+04 1.67e+04
Q3_Romania -1.475e+04 9628.549 -1.532 0.126 -3.36e+04 4124.927
Q3_Russia -1.623e+04 3614.826 -4.489 0.000 -2.33e+04 -9142.232
Q3_Saudi Arabia 0 8741.415 0 1.000 -1.71e+04 1.71e+04
Q3_Singapore 1.75e+04 6506.101 2.690 0.007 4745.348 3.03e+04
Q3_South Africa 0 6662.449 0 1.000 -1.31e+04 1.31e+04
Q3_South Korea 6904.1116 6427.526 1.074 0.283 -5695.346 1.95e+04
Q3_Spain 0 4196.730 0 1.000 -8226.575 8226.575
Q3_Sri Lanka -1.331e+04 1.01e+04 -1.316 0.188 -3.31e+04 6511.672
Q3_Sweden 1.167e+04 7988.214 1.461 0.144 -3990.812 2.73e+04
Q3_Switzerland 7.296e+04 8536.824 8.547 0.000 5.62e+04 8.97e+04
Q3_Taiwan -1.232e+04 5474.407 -2.251 0.024 -2.31e+04 -1593.680
Q3_Thailand -2772.8678 7121.235 -0.389 0.697 -1.67e+04 1.12e+04
Q3_Tunisia -3333.6565 9939.030 -0.335 0.737 -2.28e+04 1.61e+04
Q3_Turkey -1.763e+04 4892.042 -3.605 0.000 -2.72e+04 -8043.898
Q3_Ukraine -1.466e+04 5639.376 -2.600 0.009 -2.57e+04 -3608.107
Q3_United Arab Emirates 3.039e+04 8521.341 3.567 0.000 1.37e+04 4.71e+04
Q3_United Kingdom of Great Britain and Northern Ireland 4.105e+04 3718.650 11.039 0.000 3.38e+04 4.83e+04
Q3_United States of America 7.705e+04 2570.367 29.974 0.000 7.2e+04 8.21e+04
Q3_Viet Nam -1.808e+04 6557.131 -2.757 0.006 -3.09e+04 -5227.686
Q6_1-2 years -1.276e+04 3448.824 -3.700 0.000 -1.95e+04 -5999.140
Q6_10-20 years 1.391e+04 3571.346 3.894 0.000 6907.881 2.09e+04
Q6_20+ years 2.35e+04 3674.774 6.395 0.000 1.63e+04 3.07e+04
Q6_3-5 years -9566.3037 3413.151 -2.803 0.005 -1.63e+04 -2875.728
Q6_5-10 years 0 3487.178 0 1.000 -6835.686 6835.686
Q6_< 1 years -1.233e+04 3526.267 -3.498 0.000 -1.92e+04 -5421.619
Q20_0-49 employees -8103.4404 1742.360 -4.651 0.000 -1.15e+04 -4688.006
Q20_10,000 or more employees 1.05e+04 1945.990 5.395 0.000 6683.776 1.43e+04
Q20_1000-9,999 employees 5315.3648 1979.819 2.685 0.007 1434.455 9196.274
Q20_250-999 employees 0 2190.298 0 1.000 -4293.497 4293.497
==============================================================================
Omnibus: 7603.734 Durbin-Watson: 2.000
Prob(Omnibus): 0.000 Jarque-Bera (JB): 317591.903
Skew: 3.939 Prob(JB): 0.00
Kurtosis: 31.232 Cond. No. 52.5
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Run both models on all of the data and compare their predictions.
# Keep in mind that the data should have been split into a train and a test set
# in order to compare results effectively. This was not done here.
dropped_levels = [
    'Q4_I prefer not to answer',
    'Q5_Other',
    'Q7_OTHER_Other',
    'Q3_Other',
    'Q6_I have never written code',
    'Q20_50-249 employees',
]
combined_data = qnums(['Q4', 'Q5', 'Q7', 'Q3', 'Q6', 'Q20'], model_dummies).drop(columns=dropped_levels)

# Both models are scored on the same matrix, so build it once.
exog_all = np.array(sm.add_constant(combined_data))
male_preds = final_M.predict(exog_all)
female_preds = final_W.predict(exog_all)

# Add the prediction values to the data.
combined_data['male_preds'] = male_preds
combined_data['female_preds'] = female_preds

px.scatter(
    combined_data,
    x='male_preds',
    y='female_preds',
    trendline='ols',
    trendline_color_override='red',
)
The scatterplot shows the female predictions against the male predictions. Hovering over the trend line, we see that the formula for the straight line is:
female_predictor = 0.838546 * male_predictor - 2659.72
which shows that, for the same respondent profile, the female predictor generally predicts a lower salary than the male predictor: roughly 84% of the male prediction, minus about 2,660.
# Attach the reported approximate salary so it can be compared with the
# two prediction columns.
combined_data['aprox_salary'] = model_dummies['aprox_salary']
combined_data.head(5)
| Q4_Bachelor’s degree | Q4_Doctoral degree | Q4_Master’s degree | Q4_No formal education past high school | Q4_Professional degree | Q4_Some college/university study without earning a bachelor’s degree | Q5_Business Analyst | Q5_DBA/Database Engineer | Q5_Data Analyst | Q5_Data Engineer | Q5_Data Scientist | Q5_Machine Learning Engineer | Q5_Product/Project Manager | Q5_Research Scientist | Q5_Software Engineer | Q5_Statistician | Q7_Part_1_Python | Q7_Part_2_R | Q7_Part_3_SQL | Q7_Part_4_C | Q7_Part_5_C++ | Q7_Part_6_Java | Q7_Part_7_Javascript | Q7_Part_8_Julia | Q7_Part_9_Swift | Q7_Part_10_Bash | Q7_Part_11_MATLAB | Q7_Part_12_None | Q3_Argentina | Q3_Australia | Q3_Bangladesh | Q3_Belarus | Q3_Belgium | Q3_Brazil | Q3_Canada | Q3_Chile | Q3_China | Q3_Colombia | Q3_Egypt | Q3_France | Q3_Germany | Q3_Ghana | Q3_Greece | Q3_India | Q3_Indonesia | Q3_Iran, Islamic Republic of... | Q3_Ireland | Q3_Israel | Q3_Italy | Q3_Japan | Q3_Kenya | Q3_Malaysia | Q3_Mexico | Q3_Morocco | Q3_Nepal | Q3_Netherlands | Q3_Nigeria | Q3_Pakistan | Q3_Peru | Q3_Philippines | Q3_Poland | Q3_Portugal | Q3_Republic of Korea | Q3_Romania | Q3_Russia | Q3_Saudi Arabia | Q3_Singapore | Q3_South Africa | Q3_South Korea | Q3_Spain | Q3_Sri Lanka | Q3_Sweden | Q3_Switzerland | Q3_Taiwan | Q3_Thailand | Q3_Tunisia | Q3_Turkey | Q3_Ukraine | Q3_United Arab Emirates | Q3_United Kingdom of Great Britain and Northern Ireland | Q3_United States of America | Q3_Viet Nam | Q6_1-2 years | Q6_10-20 years | Q6_20+ years | Q6_3-5 years | Q6_5-10 years | Q6_< 1 years | Q20_0-49 employees | Q20_10,000 or more employees | Q20_1000-9,999 employees | Q20_250-999 employees | male_preds | female_preds | aprox_salary | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 126872.821615 | 114828.899434 | 112499.5 |
| 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 29968.773101 | 34281.097398 | 17499.5 |
| 4 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 132427.010478 | 122032.800965 | 137499.5 |
| 9 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 94000.507165 | 84820.444771 | 74999.5 |
| 12 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 91886.090551 | 76772.188267 | 34999.5 |
# Plot the male-model and female-model predictions against the approximate salary,
# with rows ordered by approximate salary
px.scatter(
    combined_data.sort_values(by='aprox_salary'),
    x='aprox_salary',
    y=['male_preds', 'female_preds'],
)
In the scatter plot, the male predictions frequently sit above the female predictions. This indicates that the male predictor tended to predict higher salaries than the female predictor.
# Per-row gap between the male predictions and the female predictions
# (positive means the male predictor gave the higher salary), followed by its mean
combined_data['projected_diff'] = combined_data['male_preds'].sub(combined_data['female_preds'])
combined_data['projected_diff'].mean()
10059.919985464097
# Standard deviation of the male-minus-female prediction gap
combined_data['projected_diff'].std()
15363.821440407619
# Flag rows where the female predictor gave the higher salary:
# 1 when projected_diff is negative, 0 otherwise (a difference of exactly zero maps to 0).
# A vectorized comparison is used instead of a per-row Python lambda; the result is
# cast to int64 so the column stays an integer 0/1 flag.
combined_data['women_prj_higher'] = (combined_data['projected_diff'] < 0).astype('int64')
# Count how often each predictor gave the higher salary (1 = female predictor higher).
# In this run the female predictor was higher about a fifth of the time (roughly 21% of rows).
combined_data['women_prj_higher'].value_counts()
0 8344 1 2211 Name: women_prj_higher, dtype: int64